From c6f4d33c29db8f16d965684c05656c0ca19d1d7e Mon Sep 17 00:00:00 2001 From: wgilmart Date: Tue, 6 Aug 2019 10:18:32 -0700 Subject: [PATCH 1/2] add logic files for new sizes --- .../archive/vega20_Cijk_Ailk_Bjlk_SB.yaml | 23851 +++++- .../archive/vega20_Cijk_Ailk_Bljk_SB.yaml | 11785 ++- .../archive/vega20_Cijk_Alik_Bljk_SB.yaml | 12830 ++- .../asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml | 64568 +++++++++++++-- .../asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml | 68626 ++++++++++------ .../asm_full/vega20_Cijk_Alik_Bljk_SB.yaml | 54380 ++++++++---- scripts/performance/sgemm_bert.sh | 29 + scripts/performance/sgemm_dlrm.sh | 29 + scripts/performance/sgemm_phantom.sh | 291 + scripts/performance/sgemm_winograd.sh | 343 + 10 files changed, 188624 insertions(+), 48108 deletions(-) create mode 100755 scripts/performance/sgemm_bert.sh create mode 100755 scripts/performance/sgemm_dlrm.sh create mode 100755 scripts/performance/sgemm_phantom.sh create mode 100755 scripts/performance/sgemm_winograd.sh diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml index 0f1158fda..fba37ff22 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -16658,8 +16658,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -16822,8 +16822,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -16982,8 +16982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17146,8 +17146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17306,8 +17306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17470,8 +17470,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17630,8 +17630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17790,8 +17790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17950,8 +17950,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18114,8 +18114,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18274,8 +18274,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18434,8 +18434,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18594,8 +18594,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18758,8 +18758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18925,8 +18925,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19086,8 +19086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19247,8 +19247,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19412,8 +19412,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19573,8 +19573,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19734,8 +19734,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19895,8 +19895,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20056,8 +20056,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20221,8 +20221,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20386,8 +20386,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20547,8 +20547,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20708,8 +20708,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20869,8 +20869,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21030,8 +21030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21191,8 +21191,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21352,8 +21352,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21513,8 +21513,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21674,8 +21674,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21835,8 +21835,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21996,8 +21996,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22157,8 +22157,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22322,8 +22322,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22487,8 +22487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22650,8 +22650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22817,8 +22817,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22982,8 +22982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23145,8 +23145,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23312,8 +23312,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23475,8 +23475,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23642,8 +23642,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23807,8 +23807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23970,8 +23970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24137,8 +24137,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24300,8 +24300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24467,8 +24467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24630,8 +24630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24797,8 +24797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24966,8 +24966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25133,8 +25133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25298,8 +25298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25344,155 +25344,22716 @@ _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 -- [2, 3, 0, 1] -- - - [1024, 128, 1, 128] - - [4, 1028.02] - - - [4, 704, 1, 1280] - - [43, 363.355] - - - [4, 1856, 1, 3328] - - [43, 579.434] - - - [1856, 448, 1, 3328] - - [80, 6966.73] - - - [2944, 4288, 1, 1280] - - [75, 9057.88] - - - [2368, 64, 1, 3328] - - [36, 5837.56] - - - [2368, 5888, 1, 256] - - [80, 9111.06] - - - [128, 64, 1, 256] - - [42, 374.491] - - - [5888, 1024, 1, 1280] - - [85, 8570.44] - - - [128, 6784, 1, 3328] - - [48, 7703.86] - - - [64, 4, 1, 256] - - [94, 11.2219] - - - [5888, 1856, 1, 3328] - - [80, 9394.3] - - - [5056, 704, 1, 256] - - [83, 8026.89] - - - [5888, 2944, 1, 3328] - - [73, 7608.11] - - - [1856, 4288, 1, 256] - - [74, 8986.32] - - - [1024, 5056, 1, 128] - - [66, 3898.24] - - - [5056, 5056, 1, 3328] - - [74, 9536.75] - - - [1408, 5888, 1, 1280] - - [75, 9279.09] - - - [2368, 448, 1, 128] - - [67, 2474.32] - - - [1024, 3584, 1, 3328] - - [77, 9258.48] - - - [4, 2944, 1, 1280] - - [29, 611.74] - - - [1408, 64, 1, 128] - - [0, 858.21] - - - [256, 4288, 1, 3328] - - [80, 7615.98] - - - [5888, 1408, 1, 1280] - - [73, 9620.29] - - - [704, 1856, 1, 3328] - - [74, 9033.65] - - - [4, 1408, 1, 128] - - [87, 24.355] - - - [1024, 2368, 1, 256] - - [74, 7526.15] - - - [1408, 1856, 1, 1280] - - [77, 8324.09] - - - [1408, 64, 1, 1280] - - [48, 4681.14] - - - [448, 1024, 1, 1280] - - [74, 7112.43] - - - [256, 1408, 1, 3328] - - [80, 5825.41] - - - [5056, 5056, 1, 1280] - - [83, 9233.55] - - - [448, 5056, 1, 256] - - [75, 7003.17] - - - [704, 1856, 1, 1280] - - [74, 8877.28] - - - [128, 5056, 1, 128] - - [66, 2301.04] - - - [2368, 128, 1, 256] - - [74, 3848.94] - - - [1856, 1408, 1, 128] - - [69, 4202.21] - - - [64, 5056, 1, 256] - - [75, 3109.52] - - - [6784, 256, 1, 3328] - - [74, 6388.43] - - - [6784, 4288, 1, 3328] - - [85, 9114.57] - - - [4288, 448, 1, 256] - - [78, 5782.95] - - - [64, 704, 1, 128] - - [11, 379.419] - - - [1856, 2368, 1, 3328] - - [74, 9128.36] - - - [4288, 2944, 1, 1280] - - [80, 9182.23] - - - [704, 5056, 1, 1280] - - [74, 9071.47] - - - [2368, 704, 1, 3328] - - [80, 7731.33] - - - [256, 5888, 1, 256] - - [74, 7920.28] - - - [1856, 4288, 1, 3328] - - [80, 9329.97] - - - [256, 2944, 1, 256] - - [81, 5312.17] - - - [5888, 1024, 1, 256] - - [72, 6710.87] - - - [448, 64, 1, 1280] - - [47, 2814.43] - - - [448, 5056, 1, 3328] - - [74, 8255.43] - - - [3584, 4, 1, 1280] - - [23, 640.715] - - - [2944, 64, 1, 256] - - [22, 2621.44] - - - [128, 4, 1, 1280] - - [94, 86.2316] - - - [1408, 2944, 1, 256] - - [74, 8848.89] - - - [256, 1856, 1, 1280] - - [74, 7366.45] - - - [6784, 5056, 1, 3328] - - [85, 8332.06] - - - [5056, 5056, 1, 256] - - [80, 9171.64] - - - [1408, 6784, 1, 128] - - [66, 5079.09] - - - [64, 1024, 1, 1280] - - [38, 3679.21] - - - [2944, 4, 1, 256] - - [29, 369.443] - - - [704, 5056, 1, 128] - - [66, 4509.17] - - - [4, 2368, 1, 1280] - - [23, 569.744] - - - [2368, 2944, 1, 1280] - - [85, 7451.04] - - - [128, 3584, 1, 1280] - - [83, 6071.16] - - - [6784, 6784, 1, 1280] - - [80, 9535.64] - - - [1024, 256, 1, 3328] - - [74, 5742.58] - - - [1408, 4288, 1, 1280] - - [83, 8254.99] - - - [3584, 4288, 1, 1280] - - [85, 9651.09] - - - [2368, 704, 1, 1280] - - [80, 8291.3] - - - [5056, 4288, 1, 3328] - - [72, 9406.26] - - - [3584, 2368, 1, 3328] - - [80, 9350.22] - - - [64, 704, 1, 1280] - - [47, 3384.49] + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 293 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 298 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 299 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [1024, 128, 1, 128] + - [4, 1028.02] + - - [4, 704, 1, 1280] + - [43, 363.355] + - - [4, 1856, 1, 3328] + - [43, 579.434] + - - [1856, 448, 1, 3328] + - [80, 6966.73] + - - [2944, 4288, 1, 1280] + - [75, 9057.88] + - - [2368, 64, 1, 3328] + - [36, 5837.56] + - - [2368, 5888, 1, 256] + - [80, 9111.06] + - - [128, 64, 1, 256] + - [42, 374.491] + - - [5888, 1024, 1, 1280] + - [85, 8570.44] + - - [128, 6784, 1, 3328] + - [48, 7703.86] + - - [64, 4, 1, 256] + - [94, 11.2219] + - - [5888, 1856, 1, 3328] + - [80, 9394.3] + - - [5056, 704, 1, 256] + - [83, 8026.89] + - - [5888, 2944, 1, 3328] + - [73, 7608.11] + - - [1856, 4288, 1, 256] + - [74, 8986.32] + - - [1024, 5056, 1, 128] + - [66, 3898.24] + - - [5056, 5056, 1, 3328] + - [74, 9536.75] + - - [1408, 5888, 1, 1280] + - [75, 9279.09] + - - [2368, 448, 1, 128] + - [67, 2474.32] + - - [1024, 3584, 1, 3328] + - [77, 9258.48] + - - [4, 2944, 1, 1280] + - [29, 611.74] + - - [1408, 64, 1, 128] + - [0, 858.21] + - - [256, 4288, 1, 3328] + - [80, 7615.98] + - - [5888, 1408, 1, 1280] + - [73, 9620.29] + - - [704, 1856, 1, 3328] + - [74, 9033.65] + - - [4, 1408, 1, 128] + - [87, 24.355] + - - [1024, 2368, 1, 256] + - [74, 7526.15] + - - [1408, 1856, 1, 1280] + - [77, 8324.09] + - - [1408, 64, 1, 1280] + - [48, 4681.14] + - - [448, 1024, 1, 1280] + - [74, 7112.43] + - - [256, 1408, 1, 3328] + - [80, 5825.41] + - - [5056, 5056, 1, 1280] + - [83, 9233.55] + - - [448, 5056, 1, 256] + - [75, 7003.17] + - - [704, 1856, 1, 1280] + - [74, 8877.28] + - - [128, 5056, 1, 128] + - [66, 2301.04] + - - [2368, 128, 1, 256] + - [74, 3848.94] + - - [1856, 1408, 1, 128] + - [69, 4202.21] + - - [64, 5056, 1, 256] + - [75, 3109.52] + - - [6784, 256, 1, 3328] + - [74, 6388.43] + - - [6784, 4288, 1, 3328] + - [85, 9114.57] + - - [4288, 448, 1, 256] + - [78, 5782.95] + - - [64, 704, 1, 128] + - [11, 379.419] + - - [1856, 2368, 1, 3328] + - [74, 9128.36] + - - [4288, 2944, 1, 1280] + - [80, 9182.23] + - - [704, 5056, 1, 1280] + - [74, 9071.47] + - - [2368, 704, 1, 3328] + - [80, 7731.33] + - - [256, 5888, 1, 256] + - [74, 7920.28] + - - [1856, 4288, 1, 3328] + - [80, 9329.97] + - - [256, 2944, 1, 256] + - [81, 5312.17] + - - [5888, 1024, 1, 256] + - [72, 6710.87] + - - [448, 64, 1, 1280] + - [47, 2814.43] + - - [448, 5056, 1, 3328] + - [74, 8255.43] + - - [3584, 4, 1, 1280] + - [23, 640.715] + - - [2944, 64, 1, 256] + - [22, 2621.44] + - - [128, 4, 1, 1280] + - [94, 86.2316] + - - [1408, 2944, 1, 256] + - [74, 8848.89] + - - [256, 1856, 1, 1280] + - [74, 7366.45] + - - [6784, 5056, 1, 3328] + - [85, 8332.06] + - - [5056, 5056, 1, 256] + - [80, 9171.64] + - - [1408, 6784, 1, 128] + - [66, 5079.09] + - - [64, 1024, 1, 1280] + - [38, 3679.21] + - - [2944, 4, 1, 256] + - [29, 369.443] + - - [704, 5056, 1, 128] + - [66, 4509.17] + - - [4, 2368, 1, 1280] + - [23, 569.744] + - - [2368, 2944, 1, 1280] + - [85, 7451.04] + - - [128, 3584, 1, 1280] + - [83, 6071.16] + - - [6784, 6784, 1, 1280] + - [80, 9535.64] + - - [1408, 4288, 1, 1280] + - [83, 8254.99] + - - [3584, 4288, 1, 1280] + - [85, 9651.09] + - - [2368, 704, 1, 1280] + - [80, 8291.3] + - - [5056, 4288, 1, 3328] + - [72, 9406.26] + - - [3584, 2368, 1, 3328] + - [80, 9350.22] + - - [64, 704, 1, 1280] + - [47, 3384.49] - - [4288, 256, 1, 256] - [80, 5593.52] - - [2944, 128, 1, 128] @@ -26959,8 +49520,6 @@ - [56, 5129.81] - - [2368, 3584, 1, 256] - [74, 8998.7] - - - [1024, 256, 1, 1280] - - [81, 3566.58] - - [5056, 3584, 1, 1280] - [75, 9345.07] - - [448, 4, 1, 3328] @@ -29453,4 +52012,886 @@ - [162, 5765.37] - - [4096, 3072, 1, 128] - [164, 8869.01] + - - [768, 3072, 1, 4096] + - [176, 10028.7] + - - [64, 256, 192, 256] + - [170, 8791.55] + - - [768, 2, 1, 16] + - [173, 4.95484] + - - [768, 768, 1, 64] + - [169, 3469.55] + - - [768, 768, 1, 4096] + - [177, 7475.0] + - - [768, 30522, 1, 1280] + - [180, 10296.9] + - - [64, 128, 384, 128] + - [170, 7660.83] + - - [768, 30522, 1, 320] + - [178, 10007.9] + - - [768, 768, 1, 32] + - [167, 2359.3] + - - [3072, 768, 1, 4096] + - [176, 10033.7] + - - [768, 30522, 1, 640] + - [179, 10206.7] + - - [64, 64, 768, 64] + - [168, 5494.72] + - - [768, 768, 1, 640] + - [177, 6721.64] + - - [768, 768, 1, 16] + - [166, 1203.72] + - - [768, 768, 1, 1280] + - [175, 7138.57] + - - [768, 2, 1, 32] + - [171, 11.8154] + - - [2048, 2048, 1, 512] + - [191, 9607.57] + - - [512, 32, 1, 200] + - [184, 422.268] + - - [1024, 1, 1, 200] + - [187, 24.6154] + - - [1600, 1024, 1, 512] + - [182, 8115.91] + - - [560, 1024, 1, 200] + - [181, 4810.74] + - - [1024, 1024, 1, 512] + - [190, 8614.74] + - - [2048, 1, 1, 512] + - [185, 80.9086] + - - [512, 512, 1, 200] + - [183, 4398.39] + - - [100, 2048, 1, 512] + - [188, 4443.12] + - - [1024, 1024, 1, 200] + - [189, 6990.51] + - - [1024, 64, 1, 512] + - [186, 2853.27] + - - [1024, 256, 1, 18944] + - [210, 9196.41] + - - [256, 3328, 1, 8976] + - [200, 8299.26] + - - [1024, 256, 1, 4352] + - [208, 8813.74] + - - [256, 9728, 1, 8976] + - [203, 9638.48] + - - [1024, 256, 1, 3072] + - [210, 8640.63] + - - [768, 2048, 1, 256] + - [202, 8662.93] + - - [1024, 256, 1, 19968] + - [207, 9220.86] + - - [256, 12800, 1, 8976] + - [197, 9418.42] + - - [1024, 256, 1, 3328] + - [211, 8682.48] + - - [256, 10240, 1, 8976] + - [204, 10137.7] + - - [1024, 256, 1, 15104] + - [209, 9167.03] + - - [256, 10496, 1, 8976] + - [197, 9858.38] + - - [1024, 256, 1, 2816] + - [212, 8575.71] + - - [1024, 256, 1, 4608] + - [207, 8861.21] + - - [256, 11264, 1, 8976] + - [194, 9627.69] + - - [1024, 256, 1, 6400] + - [207, 8985.23] + - - [1024, 256, 1, 16128] + - [207, 9170.26] + - - [256, 44505, 1, 8976] + - [201, 10331.8] + - - [256, 6144, 1, 8976] + - [204, 10395.0] + - - [1024, 256, 1, 5120] + - [209, 8881.53] + - - [1024, 256, 1, 7936] + - [212, 9023.14] + - - [256, 3840, 1, 8976] + - [199, 9541.28] + - - [1024, 256, 1, 21248] + - [207, 9209.72] + - - [1024, 256, 1, 12032] + - [209, 9156.17] + - - [256, 8192, 1, 8976] + - [206, 10374.4] + - - [1024, 256, 1, 3584] + - [208, 8712.2] + - - [1024, 256, 1, 14336] + - [209, 9162.51] + - - [256, 7168, 1, 8976] + - [195, 9554.86] + - - [1024, 256, 1, 13568] + - [207, 9165.04] + - - [256, 4096, 1, 8976] + - [199, 10146.6] + - - [1024, 256, 1, 4096] + - [208, 8783.88] + - - [256, 2560, 1, 8976] + - [198, 8381.56] + - - [256, 20992, 1, 8976] + - [197, 9989.86] + - - [256, 4352, 1, 8976] + - [198, 9634.92] + - - [256, 33536, 1, 8976] + - [197, 10218.1] + - - [256, 3584, 1, 8976] + - [199, 8924.5] + - - [256, 26112, 1, 8976] + - [198, 10272.3] + - - [256, 14336, 1, 8976] + - [202, 10217.3] + - - [1024, 256, 1, 14848] + - [209, 9185.19] + - - [1024, 256, 1, 8448] + - [210, 9025.89] + - - [1024, 256, 1, 28672] + - [207, 9256.4] + - - [1024, 256, 1, 5632] + - [207, 8932.69] + - - [256, 22016, 1, 8976] + - [202, 10151.9] + - - [1024, 256, 1, 33536] + - [207, 9243.07] + - - [256, 5120, 1, 8976] + - [193, 9418.05] + - - [256, 11520, 1, 8976] + - [200, 9701.0] + - - [256, 19968, 1, 8976] + - [198, 10228.0] + - - [1024, 256, 1, 5376] + - [209, 8892.52] + - - [1024, 256, 1, 22016] + - [207, 9244.24] + - - [256, 8960, 1, 8976] + - [198, 9841.31] + - - [1024, 256, 1, 15872] + - [207, 9223.15] + - - [256, 17408, 1, 8976] + - [202, 9785.77] + - - [256, 5632, 1, 8976] + - [202, 9564.22] + - - [256, 32512, 1, 8976] + - [201, 10357.9] + - - [256, 11008, 1, 8976] + - [194, 9445.13] + - - [1024, 256, 1, 6144] + - [209, 8955.81] + - - [256, 4864, 1, 8976] + - [194, 8979.35] + - - [256, 15104, 1, 8976] + - [197, 10007.0] + - - [1024, 256, 1, 9984] + - [207, 9110.43] + - - [256, 1280, 1, 8976] + - [193, 5944.34] + - - [1024, 256, 1, 1024] + - [209, 7005.1] + - - [1024, 256, 1, 9728] + - [209, 9066.19] + - - [1024, 256, 1, 10496] + - [207, 9118.05] + - - [256, 11776, 1, 8976] + - [204, 9911.64] + - - [256, 12544, 1, 8976] + - [197, 9235.25] + - - [1024, 256, 1, 17152] + - [207, 9152.21] + - - [1024, 256, 1, 11520] + - [209, 9146.77] + - - [1024, 256, 1, 21504] + - [209, 9207.42] + - - [256, 17152, 1, 8976] + - [196, 9654.71] + - - [1024, 256, 1, 17408] + - [207, 9181.17] + - - [256, 15872, 1, 8976] + - [205, 10086.4] + - - [256, 18688, 1, 8976] + - [198, 9612.47] + - - [256, 5888, 1, 8976] + - [202, 9988.33] + - - [512, 2048, 1, 256] + - [192, 7678.36] + - - [1024, 256, 1, 7680] + - [210, 9032.96] + - - [1024, 256, 1, 1280] + - [212, 7767.23] + - - [256, 14848, 1, 8976] + - [198, 9852.66] + - - [256, 9984, 1, 8976] + - [204, 9908.87] + - - [256, 20480, 1, 8976] + - [202, 10337.1] + - - [1024, 256, 1, 8192] + - [209, 9044.32] + - - [1024, 256, 1, 19712] + - [208, 9184.18] + - - [256, 13568, 1, 8976] + - [198, 9927.82] + - - [256, 13312, 1, 8976] + - [197, 9757.91] + - - [256, 2816, 1, 8976] + - [197, 9191.43] + - - [1024, 256, 1, 2304] + - [208, 8444.91] + - - [256, 21248, 1, 8976] + - [198, 10127.5] + - - [256, 16128, 1, 8976] + - [206, 10238.4] + - - [256, 512, 36, 98] + - [229, 7994.85] + - - [64, 192, 36, 25088] + - [298, 8613.89] + - - [128, 128, 64, 25] + - [228, 2540.15] + - - [256, 256, 64, 56] + - [229, 6924.56] + - - [512, 486, 36, 800] + - [236, 8994.84] + - - [512, 512, 36, 1568] + - [247, 9872.38] + - - [64, 192, 64, 3200] + - [292, 9295.89] + - - [256, 384, 36, 4096] + - [292, 9334.61] + - - [128, 256, 64, 32] + - [231, 4279.9] + - - [64, 128, 64, 23104] + - [298, 10103.1] + - - [128, 256, 64, 9] + - [222, 1709.63] + - - [256, 512, 36, 784] + - [232, 9520.73] + - - [256, 324, 36, 32] + - [270, 4473.38] + - - [512, 512, 36, 33] + - [241, 5925.17] + - - [16, 32, 36, 5760] + - [245, 1448.8] + - - [192, 384, 64, 128] + - [292, 8618.43] + - - [512, 512, 64, 72] + - [248, 8260.12] + - - [128, 128, 64, 1600] + - [221, 9008.38] + - - [512, 512, 36, 128] + - [292, 8871.62] + - - [192, 384, 64, 2304] + - [221, 9657.16] + - - [384, 256, 64, 450] + - [257, 9538.93] + - - [3, 64, 36, 6272] + - [245, 509.784] + - - [3, 64, 64, 2888] + - [274, 708.621] + - - [384, 256, 64, 2304] + - [257, 10287.5] + - - [512, 512, 64, 144] + - [292, 9226.7] + - - [256, 256, 36, 6272] + - [232, 9607.28] + - - [80, 192, 64, 4608] + - [293, 7347.93] + - - [64, 64, 36, 3136] + - [280, 5959.05] + - - [256, 384, 64, 2304] + - [257, 10283.4] + - - [512, 512, 36, 66] + - [241, 7618.08] + - - [128, 256, 64, 800] + - [267, 9611.15] + - - [64, 128, 36, 30] + - [223, 1242.61] + - - [192, 256, 36, 512] + - [292, 8657.97] + - - [256, 512, 64, 200] + - [292, 9153.87] + - - [256, 512, 64, 25] + - [270, 5349.88] + - - [3, 64, 64, 46208] + - [273, 808.562] + - - [128, 256, 36, 1568] + - [265, 8528.62] + - - [64, 128, 64, 11552] + - [298, 9997.0] + - - [128, 192, 64, 946] + - [292, 9198.38] + - - [64, 192, 64, 12800] + - [253, 9000.66] + - - [224, 224, 64, 128] + - [230, 6312.07] + - - [128, 256, 64, 288] + - [292, 8697.87] + - - [64, 64, 64, 826] + - [235, 6650.21] + - - [256, 384, 64, 1152] + - [267, 10106.8] + - - [3, 64, 64, 92416] + - [273, 812.031] + - - [32, 32, 36, 43808] + - [214, 2813.09] + - - [160, 320, 64, 288] + - [224, 8090.86] + - - [1, 16, 36, 23040] + - [261, 42.6667] + - - [128, 256, 36, 128] + - [239, 6049.48] + - - [128, 128, 64, 3360] + - [292, 9199.96] + - - [128, 128, 64, 420] + - [292, 8131.5] + - - [64, 128, 64, 361] + - [229, 6937.98] + - - [512, 512, 36, 16] + - [285, 3797.66] + - - [384, 256, 36, 800] + - [226, 9151.65] + - - [192, 384, 36, 4096] + - [226, 8867.57] + - - [64, 64, 64, 1600] + - [278, 7931.74] + - - [256, 384, 64, 576] + - [258, 9745.8] + - - [512, 512, 64, 14] + - [241, 3638.18] + - - [512, 512, 36, 8] + - [216, 2279.51] + - - [512, 486, 64, 128] + - [232, 8337.83] + - - [1, 16, 64, 640] + - [266, 49.9512] + - - [64, 96, 64, 288] + - [291, 5707.97] + - - [96, 96, 36, 1568] + - [260, 6866.75] + - - [256, 256, 36, 128] + - [264, 7703.82] + - - [64, 128, 36, 53824] + - [252, 6331.31] + - - [256, 256, 36, 32] + - [248, 4648.86] + - - [192, 256, 64, 288] + - [292, 8987.79] + - - [256, 256, 36, 16] + - [262, 2912.71] + - - [128, 256, 36, 3200] + - [265, 8680.27] + - - [160, 320, 64, 512] + - [224, 8449.44] + - - [128, 160, 36, 512] + - [235, 7214.97] + - - [96, 96, 36, 2592] + - [230, 7104.79] + - - [64, 96, 64, 800] + - [260, 7268.32] + - - [147, 64, 36, 18816] + - [276, 7116.26] + - - [160, 320, 36, 512] + - [230, 7874.82] + - - [256, 512, 36, 4] + - [269, 1034.78] + - - [96, 128, 64, 946] + - [252, 7901.07] + - - [256, 324, 64, 1568] + - [257, 8589.53] + - - [128, 128, 64, 50] + - [248, 4070.56] + - - [35, 96, 36, 8960] + - [242, 4207.3] + - - [32, 64, 36, 43808] + - [283, 4390.81] + - - [160, 224, 36, 128] + - [230, 5446.92] + - - [64, 64, 64, 81] + - [255, 2391.18] + - - [256, 256, 36, 3200] + - [221, 9559.55] + - - [256, 256, 36, 210] + - [232, 8414.61] + - - [192, 384, 64, 576] + - [292, 9468.75] + - - [512, 512, 64, 800] + - [267, 10096.4] + - - [512, 24, 36, 800] + - [218, 4761.77] + - - [64, 64, 64, 13216] + - [279, 8491.41] + - - [192, 224, 64, 1152] + - [235, 8769.06] + - - [256, 256, 64, 1152] + - [257, 9988.09] + - - [512, 486, 64, 512] + - [267, 9254.67] + - - [128, 128, 36, 784] + - [230, 7468.06] + - - [256, 512, 64, 1600] + - [254, 10232.5] + - - [512, 512, 64, 9] + - [248, 2599.78] + - - [96, 128, 64, 288] + - [260, 6599.43] + - - [64, 96, 36, 512] + - [260, 5073.75] + - - [256, 512, 36, 1568] + - [292, 9637.81] + - - [128, 128, 64, 400] + - [292, 8192.0] + - - [128, 128, 64, 800] + - [292, 8716.34] + - - [96, 128, 36, 512] + - [280, 6756.93] + - - [16, 32, 36, 360] + - [243, 754.036] + - - [128, 256, 64, 3200] + - [257, 10222.5] + - - [96, 128, 64, 800] + - [260, 7967.9] + - - [256, 512, 64, 4] + - [222, 1097.99] + - - [256, 256, 64, 450] + - [267, 9347.45] + - - [64, 64, 64, 3200] + - [278, 8518.08] + - - [192, 224, 64, 128] + - [238, 7035.17] + - - [128, 128, 64, 288] + - [292, 7751.28] + - - [256, 256, 64, 72] + - [248, 7489.83] + - - [96, 208, 36, 512] + - [260, 6939.11] + - - [128, 256, 36, 3136] + - [235, 8669.33] + - - [64, 64, 36, 3520] + - [230, 6007.47] + - - [64, 128, 36, 1568] + - [293, 6897.7] + - - [160, 320, 64, 242] + - [219, 7873.17] + - - [192, 192, 36, 512] + - [230, 7707.32] + - - [512, 512, 36, 512] + - [292, 9582.42] + - - [1, 16, 64, 10240] + - [244, 71.3511] + - - [128, 128, 36, 512] + - [230, 7149.38] + - - [512, 512, 36, 256] + - [221, 9384.4] + - - [512, 512, 36, 1024] + - [215, 9777.89] + - - [96, 208, 64, 1152] + - [293, 7850.9] + - - [128, 192, 64, 3200] + - [221, 9490.82] + - - [256, 256, 36, 4096] + - [226, 9585.46] + - - [160, 160, 64, 288] + - [260, 7299.8] + - - [256, 256, 64, 896] + - [257, 9850.33] + - - [128, 256, 64, 242] + - [292, 8391.38] + - - [128, 128, 36, 440] + - [235, 6274.72] + - - [96, 128, 36, 1568] + - [280, 7875.03] + - - [192, 384, 36, 1024] + - [226, 8715.72] + - - [64, 96, 36, 10368] + - [297, 7478.59] + - - [128, 256, 64, 100] + - [241, 7084.97] + - - [112, 224, 36, 2048] + - [234, 7555.92] + - - [384, 256, 64, 1152] + - [257, 10102.3] + - - [192, 384, 36, 128] + - [292, 7543.04] + - - [128, 128, 36, 7040] + - [265, 7600.6] + - - [128, 256, 64, 1568] + - [257, 10005.9] + - - [128, 128, 36, 1568] + - [249, 7848.3] + - - [128, 256, 64, 72] + - [272, 6553.6] + - - [256, 256, 36, 12544] + - [286, 9365.04] + - - [256, 256, 36, 105] + - [248, 7286.06] + - - [128, 256, 36, 392] + - [235, 7625.69] + - - [64, 64, 64, 5408] + - [278, 8882.67] + - - [3, 64, 36, 25088] + - [245, 528.942] + - - [384, 256, 36, 1024] + - [292, 9182.75] + - - [35, 96, 36, 13440] + - [299, 4110.29] + - - [128, 256, 64, 1152] + - [257, 9804.87] + - - [256, 324, 64, 32] + - [270, 5043.63] + - - [160, 224, 64, 128] + - [284, 6046.15] + - - [192, 224, 36, 2592] + - [282, 8878.68] + - - [96, 96, 64, 1152] + - [260, 8035.45] + - - [32, 64, 36, 90] + - [217, 964.465] + - - [64, 128, 64, 2888] + - [232, 9047.23] + - - [256, 384, 36, 800] + - [292, 9154.02] + - - [512, 512, 64, 4] + - [289, 1233.62] + - - [192, 320, 36, 128] + - [229, 7388.19] + - - [64, 128, 36, 480] + - [293, 5653.27] + - - [192, 384, 64, 242] + - [292, 9079.99] + - - [256, 486, 64, 32] + - [285, 5909.18] + - - [147, 64, 64, 9702] + - [294, 7319.69] + - - [512, 512, 64, 64] + - [228, 8179.02] + - - [64, 192, 64, 3698] + - [221, 9287.89] + - - [73, 192, 64, 10439] + - [252, 6668.02] + - - [1, 16, 36, 1440] + - [268, 33.4452] + - - [128, 256, 36, 512] + - [235, 7989.15] + - - [512, 512, 64, 576] + - [267, 9951.89] + - - [64, 64, 36, 12544] + - [283, 5872.77] + - - [128, 128, 36, 880] + - [280, 7597.26] + - - [192, 224, 36, 128] + - [238, 6451.2] + - - [64, 64, 64, 800] + - [278, 6916.73] + - - [64, 128, 36, 12544] + - [256, 6395.88] + - - [64, 64, 36, 1568] + - [230, 5536.66] + - - [160, 160, 36, 512] + - [230, 7345.26] + - - [512, 24, 64, 512] + - [220, 5242.88] + - - [3, 64, 36, 3136] + - [245, 475.352] + - - [256, 256, 64, 9] + - [270, 2106.51] + - - [3, 64, 64, 11552] + - [273, 785.127] + - - [128, 256, 36, 12544] + - [288, 8792.13] + - - [128, 128, 36, 3136] + - [249, 8098.46] + - - [256, 512, 36, 3136] + - [232, 9694.39] + - - [64, 64, 36, 196] + - [246, 2757.76] + - - [144, 288, 36, 512] + - [280, 7077.89] + - - [256, 24, 64, 32] + - [259, 1483.83] + - - [384, 384, 36, 800] + - [221, 9246.5] + - - [512, 512, 64, 1600] + - [267, 10277.3] + - - [112, 224, 36, 512] + - [235, 6744.78] + - - [128, 128, 36, 49] + - [241, 2716.29] + - - [512, 512, 36, 4] + - [269, 1156.52] + - - [35, 96, 64, 4235] + - [230, 4631.28] + - - [192, 384, 64, 450] + - [221, 9372.2] + - - [256, 256, 36, 1024] + - [292, 9346.64] + - - [112, 224, 64, 1152] + - [235, 7523.95] + - - [256, 512, 64, 400] + - [254, 9597.95] + - - [149, 32, 36, 19072] + - [299, 5811.8] + - - [128, 256, 36, 6272] + - [235, 8754.68] + - - [128, 192, 36, 1568] + - [260, 8195.1] + - - [256, 256, 36, 512] + - [292, 9074.22] + - - [256, 256, 64, 112] + - [292, 8305.55] + - - [512, 512, 64, 18] + - [285, 4324.02] + - - [256, 256, 64, 18] + - [248, 3547.81] + - - [256, 256, 64, 1568] + - [257, 10141.7] + - - [64, 96, 36, 1568] + - [278, 6805.66] + - - [384, 256, 36, 4096] + - [292, 9311.1] + - - [256, 512, 64, 800] + - [267, 9998.35] + - - [256, 384, 36, 2048] + - [292, 9285.34] + - - [3, 64, 36, 200704] + - [274, 547.375] + - - [384, 384, 64, 2304] + - [215, 9901.68] + - - [160, 320, 64, 128] + - [251, 7113.81] + - - [512, 512, 36, 528] + - [221, 9567.65] + - - [160, 320, 36, 128] + - [252, 6411.13] + - - [96, 96, 64, 800] + - [260, 7690.01] + - - [256, 512, 36, 49] + - [248, 6721.25] + - - [384, 384, 64, 450] + - [221, 9523.53] + - - [3, 64, 64, 23104] + - [273, 801.621] + - - [256, 256, 64, 3200] + - [257, 10300.4] + - - [128, 192, 36, 512] + - [235, 7499.75] + - - [192, 192, 64, 288] + - [292, 8774.24] + - - [96, 208, 64, 242] + - [252, 5901.99] + - - [256, 16, 36, 3200] + - [281, 3807.77] + - - [512, 512, 64, 8] + - [259, 2379.75] + - - [64, 128, 64, 5776] + - [232, 9332.74] + - - [512, 512, 64, 288] + - [221, 9521.99] + - - [256, 16, 36, 32] + - [277, 766.005] + - - [128, 192, 64, 288] + - [292, 8527.58] + - - [32, 64, 64, 640] + - [260, 4660.34] + - - [64, 64, 36, 392] + - [260, 3686.4] + - - [384, 384, 36, 1024] + - [226, 9282.48] + - - [64, 64, 36, 11552] + - [290, 5904.78] + - - [96, 128, 36, 6272] + - [280, 8350.99] + - - [128, 256, 36, 16] + - [262, 2144.81] + - - [256, 256, 64, 288] + - [292, 9140.13] + - - [64, 64, 64, 1652] + - [278, 7766.53] + - - [256, 384, 36, 1024] + - [226, 9203.27] + - - [96, 128, 64, 3200] + - [295, 8866.2] + - - [256, 324, 36, 3200] + - [234, 8194.25] + - - [128, 192, 64, 800] + - [292, 9198.03] + - - [64, 128, 64, 10] + - [233, 851.117] + - - [96, 208, 64, 288] + - [260, 6667.58] + - - [64, 96, 36, 2592] + - [242, 7216.88] + - - [64, 128, 64, 160] + - [271, 5190.97] + - - [192, 384, 64, 512] + - [221, 9446.04] + - - [64, 64, 36, 6272] + - [230, 6212.01] + - - [512, 24, 36, 288] + - [227, 3922.47] + - - [128, 128, 64, 1568] + - [221, 9037.86] + - - [112, 224, 64, 242] + - [291, 6399.26] + - - [128, 256, 64, 1600] + - [257, 10010.3] + - - [32, 32, 64, 20000] + - [225, 4378.41] + - - [160, 192, 64, 288] + - [252, 7803.63] + - - [512, 24, 64, 128] + - [213, 3733.8] + - - [512, 512, 36, 32] + - [248, 5935.34] + - - [3, 64, 36, 100352] + - [245, 542.783] + - - [3, 64, 64, 1444] + - [274, 674.159] + - - [512, 512, 36, 3136] + - [215, 9921.1] + - - [128, 256, 64, 6400] + - [275, 10349.3] + - - [256, 256, 36, 2048] + - [292, 9518.99] + - - [128, 160, 64, 288] + - [235, 7549.75] + - - [256, 256, 64, 6400] + - [257, 10392.6] + - - [32, 64, 64, 20000] + - [283, 6493.86] + - - [256, 256, 36, 1680] + - [232, 9513.29] + - - [128, 128, 64, 210] + - [292, 7094.1] + - - [192, 384, 36, 2048] + - [221, 8818.65] + - - [256, 256, 64, 144] + - [292, 8608.61] + - - [384, 384, 36, 4096] + - [226, 9356.94] + - - [160, 320, 64, 1152] + - [252, 8749.48] + - - [384, 256, 36, 2048] + - [292, 9279.63] + - - [256, 512, 36, 392] + - [292, 9252.14] + - - [256, 512, 64, 50] + - [248, 7511.29] + - - [73, 192, 36, 23360] + - [296, 5802.93] + - - [3, 64, 36, 50176] + - [245, 542.037] + - - [384, 384, 36, 2048] + - [221, 9325.8] + - - [256, 384, 64, 450] + - [267, 9528.66] + - - [192, 320, 64, 128] + - [226, 8399.81] + - - [128, 256, 36, 32] + - [241, 3276.8] + - - [160, 192, 36, 512] + - [280, 7752.34] + - - [512, 512, 64, 256] + - [232, 9473.64] + - - [256, 512, 64, 32] + - [270, 6391.32] + - - [384, 384, 64, 576] + - [221, 9614.79] + - - [64, 64, 64, 648] + - [278, 6282.15] + - - [512, 486, 36, 288] + - [292, 8624.93] + - - [32, 64, 36, 1440] + - [230, 3961.5] + - - [144, 288, 64, 242] + - [252, 6347.02] + - - [384, 256, 64, 576] + - [257, 9775.24] + - - [512, 512, 36, 64] + - [228, 7791.28] + - - [448, 384, 64, 128] + - [221, 9132.23] + - - [64, 128, 64, 722] + - [271, 8047.11] + - - [144, 288, 64, 288] + - [280, 6859.4] + - - [512, 512, 64, 224] + - [292, 9427.29] + - - [112, 224, 64, 288] + - [291, 6736.92] + - - [384, 384, 64, 1152] + - [215, 9820.46] + - - [448, 384, 36, 128] + - [292, 8761.31] + - - [64, 64, 64, 100] + - [238, 2708.1] + - - [256, 486, 36, 128] + - [264, 7640.04] + - - [64, 96, 64, 4608] + - [293, 8351.49] + - - [16, 32, 64, 160] + - [217, 736.36] + - - [64, 192, 36, 6272] + - [293, 8041.19] + - - [64, 64, 64, 200] + - [246, 3924.31] + - - [256, 256, 36, 800] + - [292, 9299.55] + - - [64, 128, 36, 6272] + - [290, 6816.36] + - - [32, 64, 64, 40] + - [237, 885.622] + - - [256, 16, 64, 32] + - [287, 1205.26] + - - [192, 384, 36, 800] + - [226, 8673.88] + - - [128, 128, 36, 3200] + - [260, 8538.89] + - - [256, 256, 36, 256] + - [232, 8454.36] + - - [192, 384, 64, 1152] + - [221, 9589.01] + - - [128, 256, 64, 200] + - [231, 8141.12] + - - [64, 96, 64, 1152] + - [260, 7620.88] + - - [128, 128, 36, 392] + - [235, 6175.51] + - - [80, 192, 36, 10368] + - [283, 6497.16] + - - [224, 224, 36, 128] + - [293, 5826.89] + - - [512, 512, 64, 28] + - [248, 5728.81] + - - [256, 16, 64, 1568] + - [263, 4637.2] + - - [144, 288, 64, 1152] + - [280, 7784.24] + - - [256, 256, 64, 576] + - [257, 9596.12] + - - [64, 128, 36, 784] + - [293, 6058.99] + - - [256, 24, 36, 128] + - [227, 2239.84] + - - [256, 256, 64, 2304] + - [257, 10225.7] + - - [192, 384, 36, 512] + - [292, 8549.03] + - - [16, 32, 64, 2560] + - [245, 2153.13] + - - [256, 512, 36, 32] + - [270, 5702.23] + - - [512, 512, 64, 128] + - [292, 9084.11] + - - [128, 128, 64, 200] + - [229, 6971.91] + - - [512, 512, 64, 32] + - [241, 6248.5] + - - [128, 256, 36, 196] + - [241, 6628.76] + - - [8, 384, 64, 6600] + - [273, 2733.89] + - - [149, 32, 64, 8195] + - [235, 6050.91] + - - [35, 96, 64, 6160] + - [280, 4689.35] + - - [64, 64, 36, 1760] + - [230, 5622.24] - null diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml index 2cf135588..32375b607 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -32091,8 +32091,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32255,8 +32255,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32419,8 +32419,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32583,8 +32583,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32747,8 +32747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32911,8 +32911,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33075,8 +33075,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33239,8 +33239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33399,8 +33399,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33563,8 +33563,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33723,8 +33723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33887,8 +33887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34051,8 +34051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34215,8 +34215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34379,8 +34379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34543,8 +34543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34707,8 +34707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34871,8 +34871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35035,8 +35035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35199,8 +35199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35363,8 +35363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35527,8 +35527,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35691,8 +35691,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35855,8 +35855,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36019,8 +36019,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36186,8 +36186,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36349,8 +36349,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36516,8 +36516,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36679,8 +36679,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36846,8 +36846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37009,8 +37009,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37176,8 +37176,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37339,8 +37339,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37506,8 +37506,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37667,8 +37667,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37828,8 +37828,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37991,8 +37991,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38158,8 +38158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38323,8 +38323,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38486,8 +38486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38653,8 +38653,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38816,8 +38816,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38983,8 +38983,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39146,8 +39146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39309,8 +39309,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39474,8 +39474,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39637,8 +39637,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39800,8 +39800,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39965,8 +39965,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40128,8 +40128,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40291,8 +40291,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40452,8 +40452,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40613,8 +40613,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40774,8 +40774,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40935,8 +40935,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41100,8 +41100,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41263,8 +41263,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41430,8 +41430,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41593,8 +41593,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41756,8 +41756,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41915,8 +41915,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42078,8 +42078,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42239,8 +42239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42404,8 +42404,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42565,8 +42565,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42726,8 +42726,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42887,8 +42887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43052,8 +43052,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43213,8 +43213,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43374,8 +43374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43535,8 +43535,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43696,8 +43696,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43857,8 +43857,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44018,8 +44018,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44179,8 +44179,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44340,8 +44340,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44501,8 +44501,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44662,8 +44662,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44823,8 +44823,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44984,8 +44984,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45145,8 +45145,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45306,8 +45306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45467,8 +45467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45628,8 +45628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45787,8 +45787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45947,8 +45947,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46107,8 +46107,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46267,8 +46267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46427,8 +46427,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46587,8 +46587,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46747,8 +46747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46911,8 +46911,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47071,8 +47071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47231,8 +47231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47391,8 +47391,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47551,8 +47551,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47711,8 +47711,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47871,8 +47871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48035,8 +48035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48195,8 +48195,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48359,8 +48359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48519,8 +48519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48683,8 +48683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48843,8 +48843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49003,8 +49003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49163,8 +49163,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49323,8 +49323,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49483,8 +49483,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49647,8 +49647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49811,8 +49811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49975,8 +49975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50135,8 +50135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50299,8 +50299,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50463,8 +50463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50623,8 +50623,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50787,8 +50787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50951,8 +50951,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51111,8 +51111,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51275,8 +51275,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51439,8 +51439,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51603,8 +51603,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51763,8 +51763,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51927,8 +51927,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52087,8 +52087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52251,8 +52251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52415,8 +52415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52579,8 +52579,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52739,8 +52739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52903,8 +52903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53067,8 +53067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53231,8 +53231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53395,8 +53395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53559,8 +53559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53723,8 +53723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53887,8 +53887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54051,8 +54051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54215,8 +54215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54375,8 +54375,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54539,8 +54539,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54703,8 +54703,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54867,8 +54867,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55031,8 +55031,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55195,8 +55195,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55359,8 +55359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55519,8 +55519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55679,8 +55679,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55839,8 +55839,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55999,8 +55999,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56159,8 +56159,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56319,8 +56319,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56479,8 +56479,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56639,8 +56639,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56799,8 +56799,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56959,8 +56959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57119,8 +57119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57279,8 +57279,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57443,8 +57443,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57607,8 +57607,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57767,8 +57767,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57931,8 +57931,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58095,8 +58095,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58259,8 +58259,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58419,8 +58419,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58583,8 +58583,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58743,8 +58743,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58907,8 +58907,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59071,8 +59071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59231,8 +59231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59395,8 +59395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59559,8 +59559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59723,8 +59723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59887,8 +59887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60051,8 +60051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60215,8 +60215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60379,8 +60379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60543,8 +60543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60707,8 +60707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60871,8 +60871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61035,8 +61035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61199,8 +61199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61363,8 +61363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61527,8 +61527,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61691,8 +61691,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61855,8 +61855,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62015,8 +62015,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62179,8 +62179,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62343,8 +62343,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62507,8 +62507,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62671,8 +62671,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62831,8 +62831,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62991,8 +62991,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63155,8 +63155,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63319,8 +63319,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63483,8 +63483,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63647,8 +63647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63807,8 +63807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63971,8 +63971,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64135,8 +64135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64295,8 +64295,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64459,8 +64459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64619,8 +64619,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64783,8 +64783,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64943,8 +64943,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65103,8 +65103,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65267,8 +65267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65431,8 +65431,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65595,8 +65595,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65759,8 +65759,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65923,8 +65923,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66087,8 +66087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66251,8 +66251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66411,8 +66411,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66575,8 +66575,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66739,8 +66739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66903,8 +66903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67067,8 +67067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67231,8 +67231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67395,8 +67395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67559,8 +67559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67723,8 +67723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67887,8 +67887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68051,8 +68051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68215,8 +68215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68379,8 +68379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68543,8 +68543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68707,8 +68707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68871,8 +68871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69035,8 +69035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69199,8 +69199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69359,8 +69359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69519,8 +69519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69683,8 +69683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69843,8 +69843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70003,8 +70003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70167,8 +70167,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70327,8 +70327,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70491,8 +70491,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70651,8 +70651,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70811,8 +70811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70975,8 +70975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71139,8 +71139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71303,8 +71303,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71467,8 +71467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71631,8 +71631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71795,8 +71795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71959,8 +71959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72123,8 +72123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72287,8 +72287,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72451,8 +72451,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72611,8 +72611,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72775,8 +72775,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72939,8 +72939,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73103,8 +73103,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73267,8 +73267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73431,8 +73431,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73595,8 +73595,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73759,8 +73759,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73923,8 +73923,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74087,8 +74087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74251,8 +74251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74415,8 +74415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74579,8 +74579,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74739,8 +74739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74903,8 +74903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75067,8 +75067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75231,8 +75231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75395,8 +75395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75555,8 +75555,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75715,8 +75715,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75879,8 +75879,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76043,8 +76043,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76203,8 +76203,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76363,8 +76363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76523,8 +76523,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76683,8 +76683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76843,8 +76843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77003,8 +77003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77163,8 +77163,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77327,8 +77327,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77487,8 +77487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77647,8 +77647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77811,8 +77811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77975,8 +77975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78139,8 +78139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78303,8 +78303,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78467,8 +78467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78631,8 +78631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78795,8 +78795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78959,8 +78959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79123,8 +79123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79292,8 +79292,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79457,8 +79457,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79624,8 +79624,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79791,8 +79791,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79958,8 +79958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80125,8 +80125,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80294,8 +80294,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80459,8 +80459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80628,8 +80628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80795,8 +80795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80962,8 +80962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81129,8 +81129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81296,8 +81296,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81463,8 +81463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81630,8 +81630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81795,8 +81795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81962,8 +81962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82129,8 +82129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82296,8 +82296,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82463,8 +82463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82630,8 +82630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82797,8 +82797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82966,8 +82966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83133,8 +83133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83300,8 +83300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83467,8 +83467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83634,8 +83634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83801,8 +83801,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83968,8 +83968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84135,8 +84135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84300,8 +84300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84467,8 +84467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84632,8 +84632,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84799,8 +84799,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84964,8 +84964,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85131,8 +85131,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85300,8 +85300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85467,8 +85467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85634,8 +85634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85801,8 +85801,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85966,8 +85966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86133,8 +86133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86300,8 +86300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86469,8 +86469,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86636,8 +86636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86803,8 +86803,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86968,8 +86968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87135,9 +87135,10233 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6720 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2112 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 553 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 554 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 555 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 556 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 557 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 558 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 559 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 560 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 563 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 566 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 784 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2080 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4224 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 520 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1040 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -87154,31 +97378,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87192,16 +97416,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -87212,34 +97436,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: true - LdsNumElements: 6720 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -87247,26 +97471,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87304,9 +97529,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -87323,31 +97549,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87359,54 +97587,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: true - LdsNumElements: 3392 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -87414,26 +97642,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87445,7 +97674,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -87471,9 +97700,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -87490,31 +97720,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87526,54 +97758,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: true - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -87581,26 +97813,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87612,7 +97845,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -87638,9 +97871,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -87657,31 +97891,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87693,54 +97929,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: true - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -87748,26 +97984,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87805,9 +98042,183 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -87824,8 +98235,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87834,21 +98245,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87868,7 +98279,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -87876,34 +98287,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: true - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87915,18 +98326,20 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 @@ -87935,6 +98348,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87946,7 +98360,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -87972,9 +98386,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -87991,8 +98406,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88001,17 +98416,17 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -88027,16 +98442,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88054,21 +98469,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: true - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -88081,7 +98496,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -88089,19 +98504,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88139,9 +98555,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -88158,8 +98575,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88182,7 +98599,9 @@ WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [704, 1024, 1, 128] - [102, 3019.46] @@ -90662,8 +101081,6 @@ - [212, 8995.84] - - [4096, 512, 1, 2048] - [207, 9298.08] - - - [512, 256, 1, 2048] - - [200, 5186.16] - - [4096, 1024, 1, 2048] - [189, 9790.67] - - [2048, 1024, 1, 2048] @@ -94152,4 +104569,270 @@ - [524, 10427.3] - - [1024, 1, 1, 13] - [537, 0.0] + - - [768, 512, 1, 768] + - [561, 5889.04] + - - [768, 2048, 1, 3072] + - [571, 9394.62] + - - [768, 32, 1, 768] + - [583, 1502.74] + - - [64, 128, 96, 128] + - [578, 4973.48] + - - [3072, 1024, 1, 768] + - [572, 9856.07] + - - [768, 1024, 1, 3072] + - [565, 8611.06] + - - [768, 512, 1, 3072] + - [564, 6430.79] + - - [768, 64, 1, 768] + - [585, 2621.44] + - - [768, 4096, 1, 3072] + - [570, 10030.4] + - - [768, 2048, 1, 2] + - [563, 381.763] + - - [768, 2048, 1, 768] + - [568, 9754.2] + - - [768, 320, 1, 30522] + - [581, 8529.4] + - - [64, 64, 96, 64] + - [575, 2496.61] + - - [768, 640, 1, 30522] + - [562, 8253.84] + - - [768, 1280, 1, 30522] + - [567, 9572.85] + - - [768, 1280, 1, 768] + - [571, 8713.93] + - - [768, 640, 1, 768] + - [561, 7293.03] + - - [768, 32, 1, 2] + - [573, 11.8154] + - - [3072, 2048, 1, 768] + - [568, 10019.6] + - - [768, 4096, 1, 768] + - [568, 9927.35] + - - [3072, 4096, 1, 768] + - [571, 10150.1] + - - [64, 256, 192, 256] + - [577, 7054.19] + - - [768, 8, 1, 768] + - [584, 340.939] + - - [64, 128, 384, 128] + - [576, 6765.01] + - - [768, 1024, 1, 768] + - [566, 8768.58] + - - [768, 320, 1, 768] + - [582, 6838.54] + - - [64, 64, 768, 64] + - [579, 5388.83] + - - [768, 1024, 1, 2] + - [559, 258.695] + - - [768, 16, 1, 768] + - [584, 819.2] + - - [64, 256, 96, 256] + - [577, 5893.64] + - - [3072, 512, 1, 768] + - [569, 9722.79] + - - [768, 160, 1, 768] + - [586, 5019.78] + - - [768, 4096, 1, 2] + - [560, 507.375] + - - [1600, 512, 1, 1024] + - [590, 7186.95] + - - [1024, 512, 1, 64] + - [588, 2557.5] + - - [1024, 512, 1, 1] + - [587, 71.2348] + - - [2048, 512, 1, 1] + - [589, 90.3945] + - - [1024, 200, 1, 1] + - [595, 40.0] + - - [32, 200, 1, 1] + - [591, 1.56863] + - - [560, 200, 1, 1024] + - [599, 4731.35] + - - [1, 512, 1, 1] + - [598, 0.130612] + - - [64, 512, 1, 1] + - [593, 7.58519] + - - [1024, 8192, 1, 256] + - [608, 9518.99] + - - [1024, 22016, 1, 256] + - [614, 9881.12] + - - [256, 8976, 1, 4352] + - [606, 9567.08] + - - [512, 256, 1, 2048] + - [619, 5917.89] + - - [1024, 19968, 1, 256] + - [614, 9882.37] + - - [256, 8976, 1, 1536] + - [604, 8437.35] + - - [256, 8976, 1, 33536] + - [604, 8441.89] + - - [1024, 1792, 1, 256] + - [604, 7756.97] + - - [1024, 21504, 1, 256] + - [614, 9893.9] + - - [512, 215, 1, 2048] + - [620, 4665.64] + - - [1024, 7168, 1, 256] + - [608, 9509.35] + - - [256, 8976, 1, 15872] + - [610, 8914.65] + - - [1024, 19712, 1, 256] + - [614, 9771.9] + - - [256, 8976, 1, 5632] + - [610, 8740.03] + - - [1024, 14848, 1, 256] + - [614, 9756.15] + - - [1024, 28672, 1, 256] + - [614, 9958.92] + - - [256, 8976, 1, 9728] + - [617, 8853.04] + - - [1024, 17152, 1, 256] + - [608, 9737.3] + - - [256, 8976, 1, 11520] + - [610, 8999.2] + - - [256, 8976, 1, 8192] + - [600, 7897.32] + - - [1024, 3328, 1, 256] + - [615, 8593.53] + - - [256, 8976, 1, 7424] + - [610, 8980.47] + - - [1024, 18944, 1, 256] + - [614, 9854.85] + - - [1024, 10496, 1, 256] + - [609, 9453.9] + - - [256, 8976, 1, 5376] + - [607, 9608.37] + - - [256, 8976, 1, 6144] + - [604, 7880.13] + - - [1024, 40448, 1, 256] + - [614, 10016.6] + - - [256, 8976, 1, 22016] + - [617, 8939.87] + - - [256, 8976, 1, 4864] + - [605, 9211.43] + - - [256, 8976, 1, 12288] + - [601, 8065.05] + - - [1024, 9728, 1, 256] + - [614, 9636.25] + - - [256, 8976, 1, 2048] + - [602, 7001.33] + - - [1024, 10240, 1, 256] + - [608, 9619.96] + - - [256, 8976, 1, 2304] + - [606, 9509.74] + - - [1024, 7936, 1, 256] + - [614, 9300.67] + - - [768, 256, 1, 2048] + - [618, 6267.95] + - - [1024, 9984, 1, 256] + - [614, 9477.28] + - - [1024, 13312, 1, 256] + - [614, 9758.56] + - - [1024, 16128, 1, 256] + - [608, 9721.9] + - - [1024, 8960, 1, 256] + - [609, 9398.25] + - - [1024, 5120, 1, 256] + - [615, 9315.5] + - - [1024, 11264, 1, 256] + - [608, 9664.8] + - - [256, 8976, 1, 20480] + - [616, 8279.87] + - - [1024, 20992, 1, 256] + - [608, 9878.87] + - - [256, 8976, 1, 9472] + - [610, 8990.96] + - - [256, 8976, 1, 8448] + - [610, 8983.52] + - - [256, 8976, 1, 20992] + - [611, 8942.11] + - - [256, 8976, 1, 10496] + - [611, 8989.71] + - - [1024, 15104, 1, 256] + - [609, 9676.01] + - - [1024, 6400, 1, 256] + - [617, 9145.89] + - - [1024, 4096, 1, 256] + - [610, 9124.25] + - - [256, 8976, 1, 2560] + - [604, 8566.11] + - - [256, 8976, 1, 2816] + - [606, 9496.84] + - - [1024, 7680, 1, 256] + - [614, 9460.84] + - - [256, 8976, 1, 14336] + - [611, 8226.8] + - - [256, 8976, 1, 6656] + - [611, 8771.42] + - - [1024, 3072, 1, 256] + - [611, 9076.94] + - - [256, 8976, 1, 5888] + - [607, 9546.3] + - - [1024, 12288, 1, 256] + - [608, 9690.81] + - - [256, 8976, 1, 26112] + - [613, 8699.83] + - - [1024, 7424, 1, 256] + - [615, 9256.84] + - - [256, 8976, 1, 14848] + - [616, 8885.79] + - - [768, 215, 1, 2048] + - [618, 5628.59] + - - [1024, 2560, 1, 256] + - [611, 8820.83] + - - [256, 8976, 1, 19968] + - [610, 8928.86] + - - [256, 8976, 1, 9984] + - [610, 8993.12] + - - [1024, 4864, 1, 256] + - [611, 8974.3] + - - [1024, 33536, 1, 256] + - [614, 9943.07] + - - [256, 8976, 1, 15104] + - [611, 8996.63] + - - [1024, 2048, 1, 256] + - [609, 8462.66] + - - [256, 8976, 1, 8960] + - [611, 8998.92] + - - [1024, 6144, 1, 256] + - [616, 9359.67] + - - [1024, 14592, 1, 256] + - [614, 9667.42] + - - [256, 8976, 1, 19712] + - [610, 9020.11] + - - [1024, 11520, 1, 256] + - [609, 9527.7] + - - [1024, 5632, 1, 256] + - [608, 9297.2] + - - [256, 8976, 1, 11008] + - [617, 8994.8] + - - [256, 8976, 1, 17152] + - [611, 9003.8] + - - [256, 8976, 1, 3072] + - [600, 8261.96] + - - [1024, 3840, 1, 256] + - [617, 8671.89] + - - [1024, 14336, 1, 256] + - [614, 9760.28] + - - [1024, 20480, 1, 256] + - [608, 9887.85] + - - [1024, 23552, 1, 256] + - [608, 9890.46] + - - [256, 8976, 1, 7168] + - [603, 8478.34] + - - [1024, 13568, 1, 256] + - [608, 9654.64] + - - [1024, 4608, 1, 256] + - [616, 9218.25] + - - [256, 8976, 1, 10240] + - [601, 8076.16] + - - [1024, 8704, 1, 256] + - [610, 9475.5] + - - [1024, 11008, 1, 256] + - [614, 9524.96] + - - [1024, 8448, 1, 256] + - [608, 9352.16] + - - [256, 8976, 1, 44505] + - [612, 8430.23] - null diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml index 9b2ce9b0f..84d40bfca 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml @@ -39633,8 +39633,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -39797,8 +39797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -39961,8 +39961,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40125,8 +40125,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40289,8 +40289,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40453,8 +40453,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40617,8 +40617,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40781,8 +40781,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40945,8 +40945,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41109,8 +41109,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41273,8 +41273,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41437,8 +41437,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41601,8 +41601,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41765,8 +41765,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41925,8 +41925,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42089,8 +42089,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42253,8 +42253,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42417,8 +42417,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42581,8 +42581,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42745,8 +42745,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42909,8 +42909,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43073,8 +43073,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43237,8 +43237,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43401,8 +43401,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43566,8 +43566,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43733,8 +43733,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43898,8 +43898,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44061,8 +44061,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44226,8 +44226,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44393,8 +44393,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44558,8 +44558,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44721,8 +44721,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44886,8 +44886,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45053,8 +45053,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45218,8 +45218,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45381,8 +45381,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45546,8 +45546,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45713,8 +45713,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45878,8 +45878,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46041,8 +46041,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46206,8 +46206,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46371,8 +46371,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46538,8 +46538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46703,8 +46703,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46868,8 +46868,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47033,8 +47033,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47198,8 +47198,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47361,8 +47361,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47526,8 +47526,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47693,8 +47693,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47858,8 +47858,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48021,8 +48021,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48186,8 +48186,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48353,8 +48353,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48518,8 +48518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48681,8 +48681,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48848,8 +48848,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49011,8 +49011,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49178,8 +49178,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49341,8 +49341,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49502,8 +49502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49665,8 +49665,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49826,8 +49826,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49987,8 +49987,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50146,8 +50146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50309,8 +50309,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50468,8 +50468,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50631,8 +50631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50790,8 +50790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50953,8 +50953,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51112,8 +51112,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51275,8 +51275,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51434,8 +51434,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51597,8 +51597,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51758,8 +51758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51917,8 +51917,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52080,8 +52080,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52239,8 +52239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52400,8 +52400,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52561,8 +52561,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52728,8 +52728,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52897,8 +52897,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53064,8 +53064,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53229,8 +53229,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53396,8 +53396,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53442,92 +53442,12322 @@ _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 -- [2, 3, 0, 1] -- - - [1024, 128, 1, 128] - - [12, 896.219] - - - [4, 704, 1, 1280] - - [49, 328.876] - - - [4, 1856, 1, 3328] - - [59, 501.361] - - - [1856, 448, 1, 3328] - - [104, 5677.91] - - - [2944, 4288, 1, 1280] - - [90, 8412.39] - - - [2368, 64, 1, 3328] - - [40, 4913.92] - - - [1760, 32, 1, 1760] - - [67, 3312.94] - - - [2368, 5888, 1, 256] - - [90, 6489.72] - - - [5888, 1856, 1, 256] - - [102, 7791.88] - - - [128, 64, 1, 256] - - [74, 369.217] - - - [512, 24000, 1, 1536] - - [96, 8827.37] - - - [128, 6784, 1, 3328] - - [96, 6536.99] - - - [5888, 1408, 1, 256] - - [110, 6129.61] - - - [5888, 1856, 1, 3328] - - [96, 7969.17] - - - [5056, 704, 1, 256] - - [96, 6723.82] - - - [2048, 400, 1, 512] - - [102, 4531.44] - - - [5888, 2944, 1, 3328] - - [102, 8608.04] - - - [1856, 4288, 1, 256] - - [102, 6297.54] - - - [1024, 5056, 1, 128] - - [80, 3595.37] - - - [5056, 5056, 1, 3328] - - [96, 8559.16] - - - [1408, 5888, 1, 1280] - - [91, 6797.06] - - - [2368, 448, 1, 128] - - [80, 2814.9] - - - [2368, 6784, 1, 128] - - [84, 4781.98] - - - [1024, 3584, 1, 3328] - - [92, 8402.44] - - - [512, 48000, 1, 2048] - - [96, 8162.23] - - - [128, 448, 1, 1280] - - [67, 2903.49] - - - [256, 4288, 1, 3328] - - [97, 6345.94] - - - [5888, 1408, 1, 1280] - - [96, 8959.45] - - - [704, 1856, 1, 3328] - - [91, 6955.27] - - - [4, 1408, 1, 128] - - [111, 60.0747] - - - [1024, 2368, 1, 256] - - [98, 5927.78] - - - [64, 4, 1, 256] - - [116, 13.2129] - - - [1408, 1856, 1, 1280] - - [94, 8051.58] - - - [1408, 64, 1, 1280] - - [70, 3400.45] - - - [448, 1024, 1, 1280] - - [98, 5729.92] - - - [6144, 24000, 1, 2048] - - [102, 7738.3] - - - [4096, 32, 1, 4096] - - [40, 2381.43] - - - [256, 1408, 1, 3328] - - [98, 4844.78] - - - [5056, 5056, 1, 1280] - - [102, 9090.1] - - - [448, 5056, 1, 256] - - [108, 4961.18] - - - [704, 1856, 1, 1280] - - [94, 6456.44] - - - [128, 5056, 1, 128] - - [23, 2251.02] - - - [2368, 128, 1, 256] + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 341 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 342 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2144 + LdsOffsetA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 343 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2176 + LdsOffsetA: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 344 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 345 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2176 + LdsOffsetA: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 346 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 347 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 348 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 349 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 350 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 351 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 352 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 353 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 96 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 354 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 355 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 356 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 357 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 358 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 359 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 360 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 361 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 362 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 363 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 364 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 365 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 366 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 367 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 368 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 369 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 370 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 371 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 372 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 373 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 374 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 375 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 376 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 377 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 378 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 379 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 380 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 381 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 382 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 383 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 384 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 385 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 386 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 387 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 388 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 389 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 390 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 391 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 392 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 393 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 394 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 395 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 396 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 397 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 398 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 399 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 400 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 401 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 402 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 403 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 404 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 405 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 406 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 407 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 408 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 409 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 410 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 411 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 412 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [1024, 128, 1, 128] + - [12, 896.219] + - - [4, 704, 1, 1280] + - [49, 328.876] + - - [4, 1856, 1, 3328] + - [59, 501.361] + - - [1856, 448, 1, 3328] + - [104, 5677.91] + - - [2944, 4288, 1, 1280] + - [90, 8412.39] + - - [2368, 64, 1, 3328] + - [40, 4913.92] + - - [1760, 32, 1, 1760] + - [67, 3312.94] + - - [2368, 5888, 1, 256] + - [90, 6489.72] + - - [5888, 1856, 1, 256] + - [102, 7791.88] + - - [128, 64, 1, 256] + - [74, 369.217] + - - [512, 24000, 1, 1536] + - [96, 8827.37] + - - [128, 6784, 1, 3328] + - [96, 6536.99] + - - [5888, 1408, 1, 256] + - [110, 6129.61] + - - [5888, 1856, 1, 3328] + - [96, 7969.17] + - - [5056, 704, 1, 256] + - [96, 6723.82] + - - [2048, 400, 1, 512] + - [102, 4531.44] + - - [5888, 2944, 1, 3328] + - [102, 8608.04] + - - [1856, 4288, 1, 256] + - [102, 6297.54] + - - [1024, 5056, 1, 128] + - [80, 3595.37] + - - [5056, 5056, 1, 3328] + - [96, 8559.16] + - - [1408, 5888, 1, 1280] + - [91, 6797.06] + - - [2368, 448, 1, 128] + - [80, 2814.9] + - - [2368, 6784, 1, 128] + - [84, 4781.98] + - - [1024, 3584, 1, 3328] + - [92, 8402.44] + - - [512, 48000, 1, 2048] + - [96, 8162.23] + - - [128, 448, 1, 1280] + - [67, 2903.49] + - - [256, 4288, 1, 3328] + - [97, 6345.94] + - - [5888, 1408, 1, 1280] + - [96, 8959.45] + - - [704, 1856, 1, 3328] + - [91, 6955.27] + - - [4, 1408, 1, 128] + - [111, 60.0747] + - - [1024, 2368, 1, 256] + - [98, 5927.78] + - - [64, 4, 1, 256] + - [116, 13.2129] + - - [1408, 1856, 1, 1280] + - [94, 8051.58] + - - [1408, 64, 1, 1280] + - [70, 3400.45] + - - [448, 1024, 1, 1280] + - [98, 5729.92] + - - [6144, 24000, 1, 2048] + - [102, 7738.3] + - - [4096, 32, 1, 4096] + - [40, 2381.43] + - - [256, 1408, 1, 3328] + - [98, 4844.78] + - - [5056, 5056, 1, 1280] + - [102, 9090.1] + - - [448, 5056, 1, 256] + - [108, 4961.18] + - - [704, 1856, 1, 1280] + - [94, 6456.44] + - - [128, 5056, 1, 128] + - [23, 2251.02] + - - [2368, 128, 1, 256] - [91, 3403.27] - - [1760, 6400, 1, 1760] - [90, 8959.7] @@ -56365,8 +68595,6 @@ - [231, 6307.6] - - [1024, 512, 1, 4608] - [242, 7953.38] - - - [2048, 256, 1, 768] - - [242, 7059.14] - - [4096, 200, 1, 32] - [191, 2199.19] - - [4096, 200, 1, 3328] @@ -59037,4 +71265,260 @@ - [336, 6145.5] - - [1024, 3712, 1, 1024] - [338, 8933.88] + - - [256, 256, 192, 64] + - [343, 8264.64] + - - [768, 4096, 1, 768] + - [356, 9642.08] + - - [768, 64, 1, 768] + - [353, 1850.43] + - - [768, 1280, 1, 768] + - [356, 8738.13] + - - [30522, 320, 1, 768] + - [357, 9733.59] + - - [128, 128, 96, 64] + - [346, 5470.83] + - - [2, 16, 1, 768] + - [349, 2.47742] + - - [30522, 1280, 1, 768] + - [355, 10127.9] + - - [30522, 640, 1, 768] + - [356, 9987.61] + - - [2, 8, 1, 768] + - [348, 0.96] + - - [768, 4096, 1, 3072] + - [358, 9479.41] + - - [768, 32, 1, 768] + - [352, 880.334] + - - [2, 64, 1, 768] + - [349, 9.99024] + - - [256, 256, 96, 64] + - [343, 7614.47] + - - [64, 64, 768, 64] + - [345, 5354.43] + - - [30522, 160, 1, 768] + - [354, 7740.11] + - - [768, 320, 1, 768] + - [347, 5423.67] + - - [128, 128, 384, 64] + - [344, 7179.98] + - - [768, 16, 1, 768] + - [350, 706.376] + - - [3072, 4096, 1, 768] + - [359, 9961.74] + - - [2048, 512, 1, 100] + - [361, 5180.71] + - - [1024, 200, 1, 560] + - [362, 4061.19] + - - [256, 1280, 1, 1024] + - [369, 4337.44] + - - [256, 44505, 1, 1024] + - [405, 8597.69] + - - [10240, 8976, 1, 256] + - [408, 9471.43] + - - [256, 7168, 1, 1024] + - [399, 6718.56] + - - [8448, 8976, 1, 256] + - [391, 9601.31] + - - [18944, 8976, 1, 256] + - [400, 9666.26] + - - [256, 19200, 1, 1024] + - [376, 7488.94] + - - [5632, 8976, 1, 256] + - [388, 9358.39] + - - [256, 23552, 1, 1024] + - [403, 7980.89] + - - [256, 6656, 1, 1024] + - [403, 6287.22] + - - [256, 14336, 1, 1024] + - [398, 7049.26] + - - [256, 12544, 1, 1024] + - [376, 6728.47] + - - [2048, 684, 1, 768] + - [393, 8479.18] + - - [5376, 8976, 1, 256] + - [388, 9519.51] + - - [256, 5888, 1, 1024] + - [408, 6012.4] + - - [19968, 8976, 1, 256] + - [400, 9684.67] + - - [3840, 8976, 1, 256] + - [385, 9461.89] + - - [4608, 8976, 1, 256] + - [385, 9305.82] + - - [256, 684, 1, 1024] + - [411, 3513.06] + - - [256, 22016, 1, 1024] + - [376, 7643.79] + - - [256, 23296, 1, 1024] + - [405, 8048.12] + - - [4864, 8976, 1, 256] + - [383, 9545.62] + - - [256, 7424, 1, 1024] + - [401, 6770.65] + - - [18176, 8976, 1, 256] + - [408, 9729.47] + - - [256, 15104, 1, 1024] + - [397, 7289.08] + - - [8192, 8976, 1, 256] + - [400, 9395.49] + - - [256, 16128, 1, 1024] + - [400, 7461.28] + - - [13312, 8976, 1, 256] + - [408, 9550.97] + - - [256, 21504, 1, 1024] + - [405, 7635.93] + - - [6400, 8976, 1, 256] + - [392, 9560.96] + - - [256, 8960, 1, 1024] + - [367, 6292.36] + - - [1792, 8976, 1, 256] + - [382, 9372.18] + - - [13824, 8976, 1, 256] + - [400, 9585.27] + - - [11776, 8976, 1, 256] + - [400, 9560.34] + - - [256, 20992, 1, 1024] + - [398, 7490.65] + - - [20480, 8976, 1, 256] + - [408, 9610.7] + - - [5888, 8976, 1, 256] + - [379, 9565.2] + - - [256, 10496, 1, 1024] + - [370, 6631.96] + - - [21248, 8976, 1, 256] + - [400, 9755.77] + - - [5120, 8976, 1, 256] + - [408, 9244.59] + - - [7168, 8976, 1, 256] + - [400, 9388.42] + - - [2048, 1536, 1, 768] + - [389, 9446.04] + - - [256, 8192, 1, 1024] + - [394, 6948.89] + - - [4096, 8976, 1, 256] + - [399, 9115.94] + - - [3328, 8976, 1, 256] + - [392, 9434.55] + - - [1280, 8976, 1, 256] + - [390, 9129.8] + - - [2560, 8976, 1, 256] + - [387, 9199.48] + - - [3072, 8976, 1, 256] + - [402, 8963.6] + - - [256, 11776, 1, 1024] + - [380, 6869.8] + - - [18688, 8976, 1, 256] + - [408, 9726.21] + - - [15104, 8976, 1, 256] + - [408, 9715.71] + - - [23552, 8976, 1, 256] + - [400, 9648.42] + - - [6144, 8976, 1, 256] + - [408, 9339.8] + - - [12544, 8976, 1, 256] + - [408, 9654.45] + - - [256, 11264, 1, 1024] + - [381, 6814.98] + - - [2048, 114, 1, 512] + - [412, 4583.5] + - - [4352, 8976, 1, 256] + - [392, 9471.4] + - - [15360, 8976, 1, 256] + - [408, 9583.77] + - - [256, 31488, 1, 1024] + - [407, 8438.01] + - - [28672, 8976, 1, 256] + - [400, 9688.85] + - - [256, 18176, 1, 1024] + - [376, 7405.09] + - - [9728, 8976, 1, 256] + - [408, 9524.15] + - - [256, 2816, 1, 1024] + - [372, 5405.66] + - - [256, 18944, 1, 1024] + - [376, 7503.41] + - - [256, 3584, 1, 1024] + - [375, 6107.15] + - - [7936, 8976, 1, 256] + - [388, 9608.31] + - - [19712, 8976, 1, 256] + - [408, 9736.25] + - - [256, 14848, 1, 1024] + - [381, 7163.42] + - - [256, 8448, 1, 1024] + - [381, 6372.56] + - - [256, 6400, 1, 1024] + - [395, 6395.71] + - - [256, 6144, 1, 1024] + - [406, 6490.22] + - - [9472, 8976, 1, 256] + - [385, 9609.92] + - - [256, 9984, 1, 1024] + - [368, 6484.75] + - - [684, 8976, 1, 256] + - [377, 8128.53] + - - [20992, 8976, 1, 256] + - [400, 9689.65] + - - [2048, 684, 1, 512] + - [384, 7241.78] + - - [2048, 114, 1, 768] + - [410, 4872.46] + - - [8960, 8976, 1, 256] + - [383, 9603.35] + - - [2048, 1536, 1, 512] + - [386, 8830.11] + - - [256, 3328, 1, 1024] + - [374, 5612.55] + - - [33536, 8976, 1, 256] + - [400, 9797.71] + - - [2048, 8976, 1, 256] + - [400, 8975.46] + - - [10496, 8976, 1, 256] + - [391, 9654.43] + - - [256, 5376, 1, 1024] + - [409, 5626.34] + - - [256, 21248, 1, 1024] + - [378, 7525.45] + - - [256, 13312, 1, 1024] + - [376, 6767.11] + - - [16128, 8976, 1, 256] + - [400, 9715.57] + - - [2304, 8976, 1, 256] + - [373, 9433.83] + - - [256, 4864, 1, 1024] + - [363, 5743.55] + - - [17152, 8976, 1, 256] + - [408, 9708.94] + - - [15872, 8976, 1, 256] + - [408, 9657.57] + - - [9984, 8976, 1, 256] + - [385, 9639.74] + - - [256, 14592, 1, 1024] + - [397, 7223.92] + - - [256, 33536, 1, 1024] + - [404, 8147.31] + - - [11264, 8976, 1, 256] + - [400, 9509.96] + - - [31488, 8976, 1, 256] + - [408, 9799.31] + - - [256, 20480, 1, 1024] + - [381, 7498.2] + - - [44505, 8976, 1, 256] + - [392, 9804.78] + - - [13568, 8976, 1, 256] + - [400, 9680.24] + - - [256, 11520, 1, 1024] + - [380, 6805.26] + - - [256, 7936, 1, 1024] + - [396, 6971.77] + - - [2048, 256, 1, 768] + - [366, 7129.13] + - - [256, 4608, 1, 1024] + - [364, 5462.91] + - - [256, 2304, 1, 1024] + - [371, 4842.69] + - - [256, 2560, 1, 1024] + - [372, 5309.25] + - - [2816, 8976, 1, 256] + - [383, 9409.56] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml index 6d020d6fa..b440f65ce 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -16658,8 +16658,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -16822,8 +16822,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -16982,8 +16982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17146,8 +17146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17306,8 +17306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17470,8 +17470,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17630,8 +17630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17790,8 +17790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17950,8 +17950,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18114,8 +18114,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18274,8 +18274,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18434,8 +18434,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18594,8 +18594,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18758,8 +18758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18925,8 +18925,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19086,8 +19086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19247,8 +19247,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19412,8 +19412,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19573,8 +19573,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19734,8 +19734,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19895,8 +19895,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20056,8 +20056,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20221,8 +20221,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20386,8 +20386,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20547,8 +20547,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20708,8 +20708,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20869,8 +20869,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21030,8 +21030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21191,8 +21191,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21352,8 +21352,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21513,8 +21513,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21674,8 +21674,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21835,8 +21835,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21996,8 +21996,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22157,8 +22157,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22322,8 +22322,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22487,8 +22487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22650,8 +22650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22817,8 +22817,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22982,8 +22982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23145,8 +23145,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23312,8 +23312,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23475,8 +23475,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23642,8 +23642,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23807,8 +23807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23970,8 +23970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24137,8 +24137,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24300,8 +24300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24467,8 +24467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24630,8 +24630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24797,8 +24797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24966,8 +24966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25133,8 +25133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25298,8 +25298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25347,11 +25347,11 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25362,8 +25362,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -25371,31 +25371,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25409,10 +25406,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25420,26 +25417,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25449,6 +25454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25458,6 +25464,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25472,35 +25479,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 166 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id007 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25511,40 +25526,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25558,10 +25570,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25569,26 +25581,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25598,6 +25618,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25607,6 +25628,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25621,35 +25643,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 167 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25659,41 +25689,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25707,10 +25738,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25718,19 +25749,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -25738,6 +25776,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25747,6 +25786,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25756,6 +25796,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25770,79 +25811,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 168 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25855,11 +25905,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25869,17 +25919,24 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -25887,6 +25944,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25896,6 +25954,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25905,6 +25964,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25919,35 +25979,42607 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 169 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 293 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 298 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 299 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 300 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 301 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 302 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 303 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 304 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 305 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 306 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 307 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 308 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 309 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 310 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 311 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 312 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 313 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 314 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 315 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 316 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 317 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 318 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 319 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 384 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 384 + LdsOffsetB_Blk: 1408 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 320 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 321 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 322 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 323 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 324 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 325 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 326 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 327 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 328 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 329 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 330 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 331 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 332 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 333 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 334 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 335 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 336 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 337 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 338 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 339 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 340 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 341 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 342 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 343 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 344 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 345 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 346 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 347 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 348 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 349 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 350 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 351 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 352 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 353 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 354 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 355 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 356 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 357 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 358 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 359 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 360 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 361 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 362 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 363 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 364 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 365 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 366 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 367 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 368 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 369 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 370 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 371 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 372 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 373 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 374 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 375 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 376 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 377 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 378 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 379 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 380 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 381 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 382 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 383 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 384 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 385 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 386 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 387 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 388 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 389 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id031 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 390 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 391 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 392 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 393 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 394 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 395 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 396 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 397 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id031 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 398 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 399 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 400 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 401 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 402 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 403 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 404 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 405 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 406 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 407 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 408 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 409 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 410 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 411 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 412 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 413 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 414 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 415 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 416 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 417 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 418 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 419 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 420 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 421 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 422 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 423 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 424 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 425 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 426 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 427 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 428 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 429 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 430 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 431 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 640 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 432 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 640 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 433 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 434 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 435 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 436 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25958,8 +68590,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -25967,31 +68599,188 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 437 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 2 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26005,9 +68794,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -26016,26 +68805,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26045,6 +68840,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26054,6 +68850,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26068,47 +68865,216 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 170 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 438 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 439 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -26116,31 +69082,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 + LSPA: 2 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 16 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26153,7 +69115,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -26166,25 +69128,31 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26194,6 +69162,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26203,6 +69172,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26217,79 +69187,83 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 171 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 440 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 2 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26302,10 +69276,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -26314,26 +69288,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26343,6 +69323,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26352,6 +69333,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26366,79 +69348,83 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 172 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 441 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26451,11 +69437,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26464,25 +69450,31 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26492,6 +69484,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26501,6 +69494,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26515,35 +69509,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 173 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 442 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -26554,40 +69556,36 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26601,9 +69599,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -26614,24 +69612,30 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26641,6 +69645,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26650,6 +69655,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26664,35 +69670,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 174 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 443 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -26703,8 +69717,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -26712,31 +69726,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26750,10 +69760,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26761,26 +69771,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26790,6 +69806,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26799,6 +69816,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26813,35 +69831,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 175 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 444 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -26852,8 +69878,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -26861,31 +69887,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26900,9 +69922,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26910,26 +69932,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26939,6 +69967,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26948,6 +69977,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26962,47 +69992,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 176 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 445 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -27010,31 +70048,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27047,11 +70081,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27059,26 +70093,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27088,6 +70128,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27097,6 +70138,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27111,79 +70153,87 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 177 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 446 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27196,11 +70246,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27208,15 +70258,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27228,6 +70283,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27237,6 +70293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27246,6 +70303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27260,35 +70318,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 178 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 447 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27298,41 +70364,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27346,10 +70412,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27357,15 +70423,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27377,6 +70448,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27386,6 +70458,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27395,6 +70468,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27409,35 +70483,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 179 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 448 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27447,8 +70529,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -27465,23 +70547,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27495,10 +70577,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27508,13 +70590,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27526,6 +70611,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27535,6 +70621,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27544,6 +70631,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27558,35 +70646,45 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 180 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 449 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27596,41 +70694,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27645,9 +70743,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27655,15 +70753,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27675,6 +70778,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27684,6 +70788,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27693,6 +70798,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27707,35 +70813,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 181 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 450 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27745,8 +70859,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -27763,23 +70877,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 LSPA: 8 LSPB: 16 - LVCA: 16 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27793,10 +70907,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27804,15 +70918,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27824,6 +70943,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27833,6 +70953,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27842,6 +70963,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27856,35 +70978,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 182 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 451 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27894,41 +71024,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 LSPB: 16 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27942,10 +71072,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27953,15 +71083,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27973,6 +71106,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27982,6 +71116,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27991,6 +71126,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28005,35 +71141,45 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 183 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 452 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -28043,41 +71189,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 4 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28091,10 +71237,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28102,15 +71248,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28122,6 +71273,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28131,6 +71283,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28140,6 +71293,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28154,35 +71308,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 184 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 453 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id007 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -28192,41 +71354,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28240,10 +71402,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28251,15 +71413,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28271,6 +71436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28280,6 +71446,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28289,6 +71456,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28303,79 +71471,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 185 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 454 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 384 - LdsNumElementsAlignedB: 384 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 384 - LdsOffsetB_Blk: 1408 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28388,11 +71566,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28400,15 +71578,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28420,6 +71603,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28429,6 +71613,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28438,6 +71623,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28452,39 +71638,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 186 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 455 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -28492,56 +71686,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28549,15 +71743,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28569,6 +71768,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28578,6 +71778,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28587,6 +71788,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28601,96 +71803,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 187 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 456 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28698,15 +71908,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28718,6 +71931,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28727,6 +71941,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28736,6 +71951,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28750,39 +71966,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 188 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 457 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -28790,56 +72016,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 16 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28847,15 +72073,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28867,6 +72098,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28876,6 +72108,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28885,6 +72118,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28899,96 +72133,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 189 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 458 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 16 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28996,15 +72238,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -29016,6 +72261,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29025,6 +72271,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29034,6 +72281,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29048,33 +72296,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 190 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 459 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -29088,56 +72346,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29145,15 +72403,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -29165,6 +72428,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29174,6 +72438,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29183,6 +72448,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29197,33 +72463,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 191 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 460 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -29236,57 +72510,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29294,15 +72568,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -29314,6 +72591,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29323,6 +72601,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29332,6 +72611,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29346,33 +72626,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 192 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 461 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -29385,7 +72675,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -29395,47 +72685,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29443,26 +72734,31 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29472,6 +72768,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29481,6 +72778,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29495,14 +72793,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 193 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 462 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -29513,21 +72818,24 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29535,45 +72843,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 128 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -29582,9 +72891,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29592,19 +72901,25 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29612,6 +72927,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29621,6 +72937,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29630,6 +72947,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29644,39 +72962,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 194 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 463 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29692,48 +73018,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29741,26 +73068,33 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29770,6 +73104,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29779,6 +73114,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29793,48 +73129,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 195 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 464 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -29842,47 +73186,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29890,19 +73235,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29910,6 +73259,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29919,6 +73269,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29928,6 +73279,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29942,47 +73294,57 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 196 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 465 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -29990,37 +73352,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30028,10 +73387,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30039,26 +73398,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30068,6 +73435,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30077,6 +73445,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30091,47 +73460,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 197 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 466 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id012 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -30139,37 +73516,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30178,9 +73552,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30188,26 +73562,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30217,6 +73599,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30226,6 +73609,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30240,39 +73624,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 198 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 467 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30280,7 +73672,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -30288,48 +73680,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30337,19 +73730,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30357,6 +73757,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30366,6 +73767,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30375,6 +73777,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30389,33 +73792,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 199 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 468 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -30429,56 +73840,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30486,19 +73898,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30506,6 +73925,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30515,6 +73935,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30524,6 +73945,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30538,33 +73960,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 200 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 469 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -30578,56 +74008,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30635,19 +74066,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30655,6 +74093,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30664,6 +74103,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30673,6 +74113,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30687,39 +74128,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 201 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 470 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30727,45 +74176,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30774,9 +74224,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30784,26 +74234,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30813,6 +74271,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30822,6 +74281,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30836,45 +74296,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 202 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 471 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -30884,31 +74352,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -30921,11 +74386,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30935,24 +74400,32 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30962,6 +74435,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30971,6 +74445,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30985,14 +74460,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 203 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 472 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -31003,15 +74485,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -31024,9 +74507,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31034,26 +74517,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 8 LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1664 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -31072,9 +74556,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31082,26 +74566,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31111,6 +74601,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31120,6 +74611,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31134,17 +74626,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 204 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 473 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -31152,30 +74651,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31183,36 +74685,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31220,10 +74723,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31233,17 +74736,22 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31251,6 +74759,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31260,6 +74769,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31269,6 +74779,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31283,17 +74794,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 205 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 474 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -31301,30 +74819,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id012 - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31332,47 +74853,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31380,19 +74902,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31400,6 +74927,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31409,6 +74937,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31418,6 +74947,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31432,85 +74962,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 206 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 475 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31518,10 +75059,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31529,19 +75070,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31549,6 +75095,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31558,6 +75105,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31567,6 +75115,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31581,48 +75130,58 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 207 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 476 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31630,47 +75189,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31678,19 +75238,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31698,6 +75263,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31707,6 +75273,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31716,6 +75283,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31730,39 +75298,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 208 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 477 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31778,48 +75356,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31827,26 +75406,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31856,6 +75443,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31865,6 +75453,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31879,39 +75468,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 209 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 478 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31920,7 +75517,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31928,36 +75525,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31965,10 +75563,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31976,19 +75574,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31996,6 +75601,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32005,6 +75611,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32014,6 +75621,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32028,46 +75636,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 210 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 479 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -32077,22 +75693,23 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -32106,18 +75723,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32125,19 +75742,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32145,6 +75767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32154,6 +75777,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32163,6 +75787,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32177,39 +75802,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 211 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 480 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32225,23 +75860,24 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -32255,7 +75891,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -32263,10 +75899,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32274,8 +75910,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -32283,10 +75919,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32294,6 +75937,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32303,6 +75947,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32312,6 +75957,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32326,39 +75972,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 212 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 481 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32366,35 +76020,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -32404,18 +76059,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32423,19 +76078,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32443,6 +76105,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32452,6 +76115,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32461,6 +76125,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32475,39 +76140,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 213 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 482 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32515,55 +76188,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -32572,8 +76246,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -32581,10 +76255,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32592,6 +76273,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32601,6 +76283,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32610,6 +76293,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32624,14 +76308,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 214 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 483 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -32641,16 +76332,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32664,39 +76356,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -32710,9 +76403,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -32721,19 +76414,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32741,6 +76441,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32750,6 +76451,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32759,6 +76461,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32773,33 +76476,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 215 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 484 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32813,55 +76524,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -32870,19 +76582,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32890,6 +76609,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32899,6 +76619,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32908,6 +76629,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32922,33 +76644,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 216 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 485 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id012 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32962,39 +76692,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -33008,9 +76739,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -33019,19 +76750,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33039,6 +76777,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33048,6 +76787,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33057,6 +76797,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33071,33 +76812,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 217 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 486 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -33111,35 +76860,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -33149,18 +76899,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33168,19 +76918,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33188,6 +76945,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33197,6 +76955,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33206,6 +76965,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33220,39 +76980,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 218 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 487 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 1 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33261,7 +77029,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -33269,26 +77037,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -33298,18 +77067,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33317,26 +77086,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33346,6 +77123,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33355,6 +77133,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33369,14 +77148,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 219 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 488 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -33387,21 +77173,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33409,45 +77196,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -33455,10 +77243,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33466,19 +77254,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33486,6 +77281,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33495,6 +77291,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33504,6 +77301,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33518,39 +77316,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 220 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 489 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33567,22 +77373,23 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -33596,7 +77403,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -33604,10 +77411,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33615,8 +77422,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -33624,10 +77431,17 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33635,6 +77449,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33644,6 +77459,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33653,6 +77469,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33667,75 +77484,84 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 221 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 490 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -33745,7 +77571,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -33753,37 +77579,43 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33793,6 +77625,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33802,6 +77635,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33816,39 +77650,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 222 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 491 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33865,74 +77709,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33942,6 +77797,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33951,8 +77807,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -33965,17 +77823,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 223 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 492 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -33983,21 +77848,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34005,76 +77871,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34082,6 +77958,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34091,6 +77968,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34100,8 +77978,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34114,116 +77994,132 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 224 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 493 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34231,6 +78127,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34240,6 +78137,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34249,8 +78147,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34263,75 +78163,86 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 225 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 494 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -34341,38 +78252,45 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34380,6 +78298,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34389,6 +78308,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34398,8 +78318,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34412,85 +78334,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 226 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 495 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34498,37 +78431,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34538,6 +78479,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34547,8 +78489,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34561,85 +78505,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 227 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 496 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34647,30 +78602,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34678,6 +78640,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34687,6 +78650,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34696,8 +78660,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34710,48 +78676,58 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 228 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 497 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -34759,36 +78735,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34796,30 +78773,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34827,6 +78811,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34836,6 +78821,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34845,8 +78831,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34859,39 +78847,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 229 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 498 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34908,36 +78906,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34945,30 +78944,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34976,6 +78984,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34985,6 +78994,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34994,8 +79004,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35008,35 +79020,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 230 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 499 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -35046,41 +79066,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35094,30 +79115,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35125,6 +79153,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35134,6 +79163,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35143,8 +79173,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35157,79 +79189,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 231 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 500 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35242,31 +79285,38 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35274,6 +79324,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35283,6 +79334,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35292,8 +79344,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35306,79 +79360,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 232 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 501 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 128 - LSPA: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35391,31 +79456,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35423,6 +79497,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35432,6 +79507,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35441,8 +79517,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35455,79 +79533,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 233 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 502 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 128 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35540,7 +79627,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -35548,12 +79635,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -35561,10 +79650,17 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35572,6 +79668,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35581,6 +79678,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35590,8 +79688,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35604,75 +79704,84 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 234 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 503 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -35689,31 +79798,38 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35721,6 +79837,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35730,6 +79847,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35739,8 +79857,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35753,79 +79873,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 235 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 504 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35838,31 +79969,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35870,6 +80010,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35879,6 +80020,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35888,8 +80030,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35902,79 +80046,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 236 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 505 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35987,38 +80140,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36028,6 +80191,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36037,8 +80201,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36051,39 +80217,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 237 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 506 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36107,60 +80281,70 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36168,6 +80352,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36177,6 +80362,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36186,8 +80372,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36200,39 +80388,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 238 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36256,60 +80452,70 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36317,6 +80523,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36326,6 +80533,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36335,8 +80543,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36349,46 +80559,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 239 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -36405,29 +80623,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -36435,30 +80654,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36466,6 +80692,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36475,6 +80702,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36484,8 +80712,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36498,14 +80728,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 240 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -36516,28 +80753,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -36554,15 +80794,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 @@ -36576,7 +80817,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -36584,30 +80825,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36615,6 +80863,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36624,6 +80873,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36633,8 +80883,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36647,39 +80899,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 241 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36687,45 +80949,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 4 + LSPB: 32 LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -36733,30 +80996,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36764,6 +81036,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36773,6 +81046,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36782,8 +81056,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36796,46 +81072,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 242 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -36852,15 +81136,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 @@ -36874,7 +81159,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -36882,30 +81167,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36913,6 +81205,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36922,6 +81215,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36931,8 +81225,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36945,17 +81241,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 243 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -36963,15 +81266,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -36985,56 +81291,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 4 - LSPB: 16 + LSPB: 8 LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37042,19 +81349,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37062,6 +81376,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37071,6 +81386,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37080,6 +81396,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37094,39 +81411,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 244 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -37134,56 +81459,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37191,19 +81517,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37211,6 +81544,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37220,6 +81554,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37229,6 +81564,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37243,33 +81579,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 245 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -37283,39 +81627,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37329,10 +81674,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37340,19 +81685,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37360,6 +81712,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37369,6 +81722,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37378,6 +81732,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37392,39 +81747,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 246 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -37432,39 +81795,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37477,11 +81841,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37489,19 +81853,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37509,6 +81880,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37518,6 +81890,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37527,6 +81900,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37541,33 +81915,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 247 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -37581,56 +81963,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37638,19 +82021,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37658,6 +82048,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37667,6 +82058,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37676,6 +82068,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37690,96 +82083,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 248 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37787,19 +82189,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37807,6 +82214,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37816,6 +82224,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37825,6 +82234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37839,33 +82249,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 249 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -37878,7 +82298,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -37895,40 +82315,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37936,19 +82357,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37956,6 +82382,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37965,6 +82392,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37974,6 +82402,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37988,33 +82417,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 250 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 519 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -38028,56 +82467,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38085,19 +82525,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38105,6 +82552,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38114,6 +82562,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38123,6 +82572,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38137,79 +82587,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 251 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 520 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38222,11 +82681,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38234,19 +82693,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38254,6 +82720,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38263,6 +82730,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38272,6 +82740,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38286,46 +82755,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 252 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 521 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id027 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -38334,31 +82811,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38371,11 +82849,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38384,18 +82862,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38403,6 +82888,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38412,6 +82898,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38421,6 +82908,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38435,39 +82923,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 253 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 522 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id027 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -38483,48 +82979,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38532,19 +83029,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38552,6 +83056,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38561,6 +83066,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38570,6 +83076,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38584,33 +83091,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 254 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 523 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -38623,9 +83138,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -38636,44 +83151,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 64 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38681,19 +83197,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38701,6 +83222,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38710,6 +83232,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38719,6 +83242,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38733,39 +83257,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 255 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 524 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id031 - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -38773,8 +83307,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -38782,30 +83316,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38818,11 +83353,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38830,19 +83365,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38850,6 +83392,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38859,6 +83402,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38868,6 +83412,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38882,79 +83427,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 256 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 525 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38967,11 +83521,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38979,19 +83533,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38999,6 +83558,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39008,6 +83568,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39017,6 +83578,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39031,39 +83593,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 257 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 526 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id030 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -39079,31 +83651,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -39116,11 +83689,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39128,19 +83701,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39148,6 +83728,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39157,6 +83738,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39166,6 +83748,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39180,39 +83763,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 258 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 527 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -39228,31 +83819,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 4 LSPB: 4 - LVCA: 16 - LVCB: 16 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -39265,11 +83857,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39277,19 +83869,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39297,6 +83896,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39306,6 +83906,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39315,6 +83916,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39329,33 +83931,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 259 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 528 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id030 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39368,57 +83978,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39426,19 +84037,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39446,6 +84062,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39455,6 +84072,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39464,6 +84082,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39478,39 +84097,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 260 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 529 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -39518,56 +84147,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39575,7 +84205,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -39583,11 +84213,18 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39595,6 +84232,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39604,6 +84242,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39613,6 +84252,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39627,33 +84267,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 261 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 530 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39667,8 +84315,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -39676,47 +84324,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39724,19 +84373,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39744,6 +84400,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39753,6 +84410,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39762,6 +84420,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39776,33 +84435,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 262 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 531 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39815,57 +84482,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39873,19 +84541,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39893,6 +84566,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39902,6 +84576,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39911,6 +84586,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39925,44 +84601,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 263 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 532 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id031 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -39979,23 +84667,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 4 - LSPB: 2 + LSPB: 4 LVCA: 64 - LVCB: 128 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40008,11 +84697,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40020,17 +84709,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -40038,6 +84736,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40047,6 +84746,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40056,6 +84756,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40070,95 +84771,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 264 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 533 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id035 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40166,17 +84877,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -40184,6 +84904,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40193,6 +84914,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40202,6 +84924,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40216,91 +84939,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 265 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 534 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 1 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 1 + LSPB: 16 LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40308,24 +85045,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40335,6 +85082,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40344,6 +85092,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40358,74 +85107,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 266 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 535 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40435,14 +85198,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40450,24 +85213,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40477,6 +85250,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40486,6 +85260,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40500,45 +85275,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 267 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 536 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -40547,44 +85331,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 256 + LSCB: 32 LSPA: 8 - LSPB: 1 + LSPB: 8 LVCA: 32 - LVCB: 256 + LVCB: 32 LVPA: 8 - LVPB: 1 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 512 LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 256 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40592,24 +85381,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40619,6 +85418,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40628,6 +85428,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40642,33 +85443,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 268 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 537 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -40680,7 +85489,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -40697,36 +85507,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 + LSCB: 32 + LSPA: 4 + LSPB: 8 LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40734,24 +85549,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40761,6 +85586,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40770,6 +85596,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40784,78 +85611,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 269 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 538 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40868,7 +85705,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -40881,16 +85718,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -40898,6 +85744,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40907,6 +85754,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40916,6 +85764,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40930,44 +85779,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 270 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 539 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -40985,39 +85843,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 2 - LVPB: 2 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -41026,17 +85885,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41044,6 +85910,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41053,6 +85920,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41062,6 +85930,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41076,33 +85945,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 271 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 540 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -41114,40 +85993,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -41161,10 +86042,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41174,15 +86055,24 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41190,6 +86080,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41199,6 +86090,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41208,6 +86100,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41222,44 +86115,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 272 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 541 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -41277,25 +86179,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 + LVCB: 8 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -41304,9 +86211,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41314,24 +86221,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41341,6 +86256,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41350,6 +86266,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41364,16 +86281,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 273 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: [8, 4] - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 542 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -41381,78 +86306,82 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41460,17 +86389,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41478,6 +86416,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41487,6 +86426,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41496,6 +86436,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41510,91 +86451,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 274 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 543 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 1 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41602,24 +86557,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41629,6 +86594,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41638,6 +86604,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41652,95 +86619,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 275 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 544 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 1 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41748,17 +86725,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41766,6 +86752,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41775,6 +86762,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41784,6 +86772,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41798,33 +86787,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 276 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 545 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 1 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -41836,53 +86833,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41890,24 +86893,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41917,6 +86930,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41926,6 +86940,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41940,32 +86955,39 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 277 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 546 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -41973,43 +86995,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 1 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42022,11 +87049,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -42034,13 +87061,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42050,13 +87080,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42076,8 +87108,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42095,37 +87127,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 278 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 547 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42140,32 +87170,33 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 LSPA: 4 LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -42198,13 +87229,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42214,6 +87248,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -42221,6 +87256,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42240,8 +87276,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42259,37 +87295,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 279 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + SolutionIndex: 548 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42297,49 +87331,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 4 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -42347,9 +87386,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -42358,13 +87397,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42374,13 +87414,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42400,8 +87442,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42419,8 +87461,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 280 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 549 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -42429,27 +87471,27 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42457,7 +87499,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -42465,45 +87507,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -42511,10 +87554,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -42522,13 +87565,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42538,6 +87582,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -42545,6 +87590,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42564,8 +87610,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42583,8 +87629,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 281 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + SolutionIndex: 550 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -42592,28 +87638,28 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42624,57 +87670,62 @@ DepthU: 16 DirectToLds: false DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 LVCA: 32 - LVCB: 8 - LVPA: 1 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -42682,13 +87733,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42698,13 +87752,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42724,8 +87780,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42743,37 +87799,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 282 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42788,56 +87842,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -42846,13 +87901,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42862,6 +87920,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -42869,6 +87928,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42888,8 +87948,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42907,16 +87967,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 283 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -42927,17 +87987,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42951,37 +88009,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 1 + LVCB: 32 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42995,10 +88058,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43006,13 +88069,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43022,13 +88088,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43048,8 +88116,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43067,37 +88135,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 284 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 553 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43105,43 +88171,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -43154,11 +88225,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43166,13 +88237,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43182,13 +88256,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43208,8 +88284,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43227,8 +88303,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 285 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 554 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -43237,27 +88313,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43265,60 +88339,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43326,13 +88405,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43342,13 +88424,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43368,8 +88452,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43387,15 +88471,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 286 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 555 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -43403,21 +88487,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43425,39 +88507,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -43471,18 +88554,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43490,13 +88573,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43506,6 +88592,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -43513,6 +88600,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43532,8 +88620,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43551,37 +88639,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 287 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 556 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43589,43 +88675,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -43638,11 +88729,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43650,13 +88741,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43666,13 +88760,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43692,8 +88788,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43711,37 +88807,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 288 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 557 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43749,43 +88843,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -43798,11 +88897,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43810,13 +88909,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43826,13 +88928,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43852,8 +88956,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43871,8 +88975,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 289 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 558 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -43881,23 +88985,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -43909,43 +89011,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -43958,11 +89065,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43970,13 +89077,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43986,13 +89096,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -44012,8 +89124,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44031,8 +89143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 290 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 559 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -44041,27 +89153,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44069,14 +89179,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -44093,15 +89203,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -44115,18 +89226,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44134,13 +89245,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44150,6 +89264,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -44157,6 +89272,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -44176,8 +89292,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44195,16 +89311,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 291 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 560 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -44216,12 +89332,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -44233,7 +89347,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -44257,29 +89371,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -44287,10 +89402,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44298,15 +89413,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44316,8 +89432,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44343,8 +89460,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44362,20 +89479,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 292 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -44383,14 +89500,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44404,7 +89521,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -44418,23 +89535,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 4 - LSPB: 4 + LSPB: 8 LVCA: 64 - LVCB: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -44449,9 +89571,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44459,15 +89581,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44477,7 +89600,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -44504,8 +89628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44523,8 +89647,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 293 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -44533,17 +89657,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -44551,7 +89675,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44559,13 +89683,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -44579,40 +89703,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44620,15 +89749,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44638,7 +89768,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -44665,8 +89796,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44684,15 +89815,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 294 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 563 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -44704,15 +89835,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44727,40 +89858,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -44775,9 +89907,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44785,15 +89917,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44803,8 +89934,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44830,8 +89962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44849,8 +89981,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 295 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -44858,26 +89990,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44885,49 +90019,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -44946,15 +90085,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44964,8 +90102,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44991,8 +90130,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45010,8 +90149,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 296 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45026,15 +90165,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -45046,13 +90187,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -45070,25 +90211,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -45096,10 +90242,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45107,15 +90253,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45125,7 +90272,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -45152,8 +90300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45171,35 +90319,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 297 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 566 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45213,7 +90361,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -45227,23 +90375,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 128 LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45257,10 +90410,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45268,15 +90421,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45286,8 +90440,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45313,8 +90468,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45332,28 +90487,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 298 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -45368,13 +90523,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -45392,25 +90547,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -45418,10 +90578,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45429,15 +90589,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45447,7 +90608,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -45474,8 +90636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45493,35 +90655,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 299 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45537,7 +90699,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -45545,27 +90707,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1792 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -45584,9 +90747,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45594,15 +90757,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45612,8 +90776,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45639,8 +90804,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45658,8 +90823,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 300 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45667,26 +90832,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45714,27 +90879,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 4 - LSPB: 2 + LSPB: 4 LVCA: 64 - LVCB: 128 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45749,9 +90915,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45759,15 +90925,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45777,8 +90944,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45804,8 +90972,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45823,8 +90991,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 301 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45832,26 +91000,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45865,37 +91033,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45910,9 +91083,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45921,14 +91094,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45938,8 +91110,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45965,8 +91138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45984,15 +91157,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 302 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -46000,19 +91173,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46026,37 +91201,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46071,9 +91251,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46082,14 +91262,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46099,7 +91280,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -46126,8 +91308,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46145,15 +91327,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 303 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -46161,19 +91343,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46181,60 +91363,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 64 + LSPA: 32 LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46242,15 +91429,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46260,7 +91448,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -46287,8 +91476,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46306,35 +91495,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 304 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46342,60 +91531,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46403,15 +91597,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46421,8 +91616,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -46448,8 +91644,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46467,35 +91663,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 305 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46503,43 +91699,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 128 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 64 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46552,11 +91753,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46564,15 +91765,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46582,8 +91784,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -46609,8 +91812,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46628,35 +91831,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 306 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46670,54 +91873,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46726,14 +91934,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46743,7 +91952,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -46770,8 +91980,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46789,35 +91999,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 307 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46825,13 +92035,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -46845,40 +92055,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46886,15 +92101,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46904,7 +92120,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -46931,8 +92148,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46950,15 +92167,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 308 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -46970,15 +92187,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46986,56 +92203,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -47047,15 +92269,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47065,8 +92286,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -47092,8 +92314,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47111,8 +92333,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 309 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -47127,19 +92349,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47147,56 +92371,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -47208,7 +92437,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 2 @@ -47216,7 +92445,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47226,7 +92456,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -47253,8 +92484,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47272,8 +92503,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 310 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -47288,19 +92519,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47308,49 +92539,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -47358,9 +92594,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -47369,15 +92605,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47387,8 +92622,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -47414,8 +92650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47433,14 +92669,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 311 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -47449,19 +92685,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47469,60 +92707,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -47530,15 +92773,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47548,7 +92790,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -47575,8 +92818,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47594,35 +92837,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 312 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47630,7 +92875,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -47654,40 +92899,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -47695,15 +92941,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47713,6 +92960,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -47740,8 +92988,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47759,20 +93007,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 313 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -47780,14 +93028,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47795,7 +93043,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -47803,31 +93051,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -47841,18 +93090,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -47860,15 +93109,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47878,6 +93128,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -47905,8 +93156,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47924,16 +93175,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 314 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -47944,15 +93195,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47960,39 +93211,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -48006,18 +93258,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48025,13 +93277,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48041,6 +93296,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48068,8 +93324,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48087,33 +93343,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 315 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -48125,7 +93379,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -48133,39 +93387,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48178,11 +93433,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48190,15 +93445,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48208,6 +93464,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48235,8 +93492,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48254,8 +93511,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 316 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48263,26 +93520,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48298,39 +93555,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48344,9 +93602,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -48355,15 +93613,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48373,6 +93632,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48400,8 +93660,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48419,8 +93679,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 317 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48428,18 +93688,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -48447,7 +93707,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48462,57 +93722,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 + LSCA: 64 + LSCB: 16 LSPA: 8 LSPB: 16 LVCA: 32 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48520,13 +93781,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48536,6 +93800,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48563,8 +93828,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48582,37 +93847,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 318 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48628,56 +93891,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48685,15 +93949,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48703,6 +93968,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48730,8 +93996,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48749,35 +94015,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 319 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48785,47 +94051,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48838,11 +94105,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48850,13 +94117,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48866,6 +94136,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48893,8 +94164,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48912,8 +94183,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 320 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48921,28 +94192,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48950,7 +94219,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -48958,31 +94227,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -48996,18 +94266,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49015,15 +94285,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49033,6 +94304,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49060,8 +94332,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49079,16 +94351,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 321 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -49099,15 +94371,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49122,57 +94394,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49180,15 +94453,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49198,6 +94470,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49225,8 +94498,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49244,35 +94517,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 322 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49287,7 +94562,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -49304,23 +94579,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49335,9 +94611,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49345,13 +94621,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49361,6 +94640,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49388,8 +94668,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49407,8 +94687,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 323 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -49416,12 +94696,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -49429,15 +94709,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49445,14 +94723,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -49469,40 +94747,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49510,15 +94789,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49528,6 +94806,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49555,8 +94834,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49574,20 +94853,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 324 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -49595,14 +94874,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49610,7 +94891,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -49618,45 +94899,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -49664,9 +94946,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -49675,13 +94957,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49691,6 +94974,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49718,8 +95002,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49737,37 +95021,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 325 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49775,7 +95059,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -49799,29 +95083,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -49829,10 +95114,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49840,15 +95125,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49858,6 +95144,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49885,8 +95172,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49904,35 +95191,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 326 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49940,64 +95227,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCA: 32 + LSCB: 64 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50005,13 +95293,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50021,6 +95312,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -50048,8 +95340,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50067,33 +95359,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 327 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -50105,65 +95395,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50171,13 +95461,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50189,7 +95482,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -50215,8 +95508,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50234,33 +95527,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 328 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -50272,7 +95563,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -50299,38 +95590,38 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 32 - LVCB: 64 + LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50338,15 +95629,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50384,175 +95676,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 329 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 2 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 NumIndicesSummation: 1 - NumIndicesLD: 4 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50570,14 +95695,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 330 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -50586,15 +95711,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -50606,14 +95731,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -50632,28 +95757,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -50661,10 +95786,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50672,13 +95797,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50716,8 +95844,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50735,15 +95863,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 331 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -50756,12 +95884,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [4, 1028.02] @@ -50897,8 +96023,6 @@ - [83, 6071.16] - - [6784, 6784, 1, 1280] - [80, 9535.64] - - - [1024, 256, 1, 3328] - - [74, 5742.58] - - [1408, 4288, 1, 1280] - [83, 8254.99] - - [3584, 4288, 1, 1280] @@ -52377,8 +97501,6 @@ - [56, 5129.81] - - [2368, 3584, 1, 256] - [74, 8998.7] - - - [1024, 256, 1, 1280] - - [81, 3566.58] - - [5056, 3584, 1, 1280] - [75, 9345.07] - - [448, 4, 1, 3328] @@ -54871,4112 +99993,5872 @@ - [162, 5765.37] - - [4096, 3072, 1, 128] - [164, 8869.01] + - - [768, 3072, 1, 4096] + - [176, 10028.7] + - - [64, 256, 192, 256] + - [170, 8791.55] + - - [768, 2, 1, 16] + - [173, 4.95484] + - - [768, 768, 1, 64] + - [169, 3469.55] + - - [768, 768, 1, 4096] + - [177, 7475.0] + - - [768, 30522, 1, 1280] + - [180, 10296.9] + - - [64, 128, 384, 128] + - [170, 7660.83] + - - [768, 30522, 1, 320] + - [178, 10007.9] + - - [768, 768, 1, 32] + - [167, 2359.3] + - - [3072, 768, 1, 4096] + - [176, 10033.7] + - - [768, 30522, 1, 640] + - [179, 10206.7] + - - [64, 64, 768, 64] + - [168, 5494.72] + - - [768, 768, 1, 640] + - [177, 6721.64] + - - [768, 768, 1, 16] + - [166, 1203.72] + - - [768, 768, 1, 1280] + - [175, 7138.57] + - - [768, 2, 1, 32] + - [171, 11.8154] + - - [2048, 2048, 1, 512] + - [191, 9607.57] + - - [512, 32, 1, 200] + - [184, 422.268] + - - [1024, 1, 1, 200] + - [187, 24.6154] + - - [1600, 1024, 1, 512] + - [182, 8115.91] + - - [560, 1024, 1, 200] + - [181, 4810.74] + - - [1024, 1024, 1, 512] + - [190, 8614.74] + - - [2048, 1, 1, 512] + - [185, 80.9086] + - - [512, 512, 1, 200] + - [183, 4398.39] + - - [100, 2048, 1, 512] + - [188, 4443.12] + - - [1024, 1024, 1, 200] + - [189, 6990.51] + - - [1024, 64, 1, 512] + - [186, 2853.27] + - - [1024, 256, 1, 18944] + - [210, 9196.41] + - - [256, 3328, 1, 8976] + - [200, 8299.26] + - - [1024, 256, 1, 4352] + - [208, 8813.74] + - - [256, 9728, 1, 8976] + - [203, 9638.48] + - - [1024, 256, 1, 3072] + - [210, 8640.63] + - - [768, 2048, 1, 256] + - [202, 8662.93] + - - [1024, 256, 1, 19968] + - [207, 9220.86] + - - [256, 12800, 1, 8976] + - [197, 9418.42] + - - [1024, 256, 1, 3328] + - [211, 8682.48] + - - [256, 10240, 1, 8976] + - [204, 10137.7] + - - [1024, 256, 1, 15104] + - [209, 9167.03] + - - [256, 10496, 1, 8976] + - [197, 9858.38] + - - [1024, 256, 1, 2816] + - [212, 8575.71] + - - [1024, 256, 1, 4608] + - [207, 8861.21] + - - [256, 11264, 1, 8976] + - [194, 9627.69] + - - [1024, 256, 1, 6400] + - [207, 8985.23] + - - [1024, 256, 1, 16128] + - [207, 9170.26] + - - [256, 44505, 1, 8976] + - [201, 10331.8] + - - [256, 6144, 1, 8976] + - [204, 10395.0] + - - [1024, 256, 1, 5120] + - [209, 8881.53] + - - [1024, 256, 1, 7936] + - [212, 9023.14] + - - [256, 3840, 1, 8976] + - [199, 9541.28] + - - [1024, 256, 1, 21248] + - [207, 9209.72] + - - [1024, 256, 1, 12032] + - [209, 9156.17] + - - [256, 8192, 1, 8976] + - [206, 10374.4] + - - [1024, 256, 1, 3584] + - [208, 8712.2] + - - [1024, 256, 1, 14336] + - [209, 9162.51] + - - [256, 7168, 1, 8976] + - [195, 9554.86] + - - [1024, 256, 1, 13568] + - [207, 9165.04] + - - [256, 4096, 1, 8976] + - [199, 10146.6] + - - [1024, 256, 1, 4096] + - [208, 8783.88] + - - [256, 2560, 1, 8976] + - [198, 8381.56] + - - [256, 20992, 1, 8976] + - [197, 9989.86] + - - [256, 4352, 1, 8976] + - [198, 9634.92] + - - [256, 33536, 1, 8976] + - [197, 10218.1] + - - [256, 3584, 1, 8976] + - [199, 8924.5] + - - [256, 26112, 1, 8976] + - [198, 10272.3] + - - [256, 14336, 1, 8976] + - [202, 10217.3] + - - [1024, 256, 1, 14848] + - [209, 9185.19] + - - [1024, 256, 1, 8448] + - [210, 9025.89] + - - [1024, 256, 1, 28672] + - [207, 9256.4] + - - [1024, 256, 1, 5632] + - [207, 8932.69] + - - [256, 22016, 1, 8976] + - [202, 10151.9] + - - [1024, 256, 1, 33536] + - [207, 9243.07] + - - [256, 5120, 1, 8976] + - [193, 9418.05] + - - [256, 11520, 1, 8976] + - [200, 9701.0] + - - [256, 19968, 1, 8976] + - [198, 10228.0] + - - [1024, 256, 1, 5376] + - [209, 8892.52] + - - [1024, 256, 1, 22016] + - [207, 9244.24] + - - [256, 8960, 1, 8976] + - [198, 9841.31] + - - [1024, 256, 1, 15872] + - [207, 9223.15] + - - [256, 17408, 1, 8976] + - [202, 9785.77] + - - [256, 5632, 1, 8976] + - [202, 9564.22] + - - [256, 32512, 1, 8976] + - [201, 10357.9] + - - [256, 11008, 1, 8976] + - [194, 9445.13] + - - [1024, 256, 1, 6144] + - [209, 8955.81] + - - [256, 4864, 1, 8976] + - [194, 8979.35] + - - [256, 15104, 1, 8976] + - [197, 10007.0] + - - [1024, 256, 1, 9984] + - [207, 9110.43] + - - [256, 1280, 1, 8976] + - [193, 5944.34] + - - [1024, 256, 1, 1024] + - [209, 7005.1] + - - [1024, 256, 1, 9728] + - [209, 9066.19] + - - [1024, 256, 1, 10496] + - [207, 9118.05] + - - [256, 11776, 1, 8976] + - [204, 9911.64] + - - [256, 12544, 1, 8976] + - [197, 9235.25] + - - [1024, 256, 1, 17152] + - [207, 9152.21] + - - [1024, 256, 1, 11520] + - [209, 9146.77] + - - [1024, 256, 1, 21504] + - [209, 9207.42] + - - [256, 17152, 1, 8976] + - [196, 9654.71] + - - [1024, 256, 1, 17408] + - [207, 9181.17] + - - [256, 15872, 1, 8976] + - [205, 10086.4] + - - [256, 18688, 1, 8976] + - [198, 9612.47] + - - [256, 5888, 1, 8976] + - [202, 9988.33] + - - [512, 2048, 1, 256] + - [192, 7678.36] + - - [1024, 256, 1, 7680] + - [210, 9032.96] + - - [1024, 256, 1, 1280] + - [212, 7767.23] + - - [256, 14848, 1, 8976] + - [198, 9852.66] + - - [256, 9984, 1, 8976] + - [204, 9908.87] + - - [256, 20480, 1, 8976] + - [202, 10337.1] + - - [1024, 256, 1, 8192] + - [209, 9044.32] + - - [1024, 256, 1, 19712] + - [208, 9184.18] + - - [256, 13568, 1, 8976] + - [198, 9927.82] + - - [256, 13312, 1, 8976] + - [197, 9757.91] + - - [256, 2816, 1, 8976] + - [197, 9191.43] + - - [1024, 256, 1, 2304] + - [208, 8444.91] + - - [256, 21248, 1, 8976] + - [198, 10127.5] + - - [256, 16128, 1, 8976] + - [206, 10238.4] + - - [256, 512, 36, 98] + - [229, 7994.85] + - - [64, 192, 36, 25088] + - [298, 8613.89] + - - [128, 128, 64, 25] + - [228, 2540.15] + - - [256, 256, 64, 56] + - [229, 6924.56] + - - [512, 486, 36, 800] + - [236, 8994.84] + - - [512, 512, 36, 1568] + - [247, 9872.38] + - - [64, 192, 64, 3200] + - [292, 9295.89] + - - [256, 384, 36, 4096] + - [292, 9334.61] + - - [128, 256, 64, 32] + - [231, 4279.9] + - - [64, 128, 64, 23104] + - [298, 10103.1] + - - [128, 256, 64, 9] + - [222, 1709.63] + - - [256, 512, 36, 784] + - [232, 9520.73] + - - [256, 324, 36, 32] + - [270, 4473.38] + - - [512, 512, 36, 33] + - [241, 5925.17] + - - [16, 32, 36, 5760] + - [245, 1448.8] + - - [192, 384, 64, 128] + - [292, 8618.43] + - - [512, 512, 64, 72] + - [248, 8260.12] + - - [128, 128, 64, 1600] + - [221, 9008.38] + - - [512, 512, 36, 128] + - [292, 8871.62] + - - [192, 384, 64, 2304] + - [221, 9657.16] + - - [384, 256, 64, 450] + - [257, 9538.93] + - - [3, 64, 36, 6272] + - [245, 509.784] + - - [3, 64, 64, 2888] + - [274, 708.621] + - - [384, 256, 64, 2304] + - [257, 10287.5] + - - [512, 512, 64, 144] + - [292, 9226.7] + - - [256, 256, 36, 6272] + - [232, 9607.28] + - - [80, 192, 64, 4608] + - [293, 7347.93] + - - [64, 64, 36, 3136] + - [280, 5959.05] + - - [256, 384, 64, 2304] + - [257, 10283.4] + - - [512, 512, 36, 66] + - [241, 7618.08] + - - [128, 256, 64, 800] + - [267, 9611.15] + - - [64, 128, 36, 30] + - [223, 1242.61] + - - [192, 256, 36, 512] + - [292, 8657.97] + - - [256, 512, 64, 200] + - [292, 9153.87] + - - [256, 512, 64, 25] + - [270, 5349.88] + - - [3, 64, 64, 46208] + - [273, 808.562] + - - [128, 256, 36, 1568] + - [265, 8528.62] + - - [64, 128, 64, 11552] + - [298, 9997.0] + - - [128, 192, 64, 946] + - [292, 9198.38] + - - [64, 192, 64, 12800] + - [253, 9000.66] + - - [224, 224, 64, 128] + - [230, 6312.07] + - - [128, 256, 64, 288] + - [292, 8697.87] + - - [64, 64, 64, 826] + - [235, 6650.21] + - - [256, 384, 64, 1152] + - [267, 10106.8] + - - [3, 64, 64, 92416] + - [273, 812.031] + - - [32, 32, 36, 43808] + - [214, 2813.09] + - - [160, 320, 64, 288] + - [224, 8090.86] + - - [1, 16, 36, 23040] + - [261, 42.6667] + - - [128, 256, 36, 128] + - [239, 6049.48] + - - [128, 128, 64, 3360] + - [292, 9199.96] + - - [128, 128, 64, 420] + - [292, 8131.5] + - - [64, 128, 64, 361] + - [229, 6937.98] + - - [512, 512, 36, 16] + - [285, 3797.66] + - - [384, 256, 36, 800] + - [226, 9151.65] + - - [192, 384, 36, 4096] + - [226, 8867.57] + - - [64, 64, 64, 1600] + - [278, 7931.74] + - - [256, 384, 64, 576] + - [258, 9745.8] + - - [512, 512, 64, 14] + - [241, 3638.18] + - - [512, 512, 36, 8] + - [216, 2279.51] + - - [512, 486, 64, 128] + - [232, 8337.83] + - - [1, 16, 64, 640] + - [266, 49.9512] + - - [64, 96, 64, 288] + - [291, 5707.97] + - - [96, 96, 36, 1568] + - [260, 6866.75] + - - [256, 256, 36, 128] + - [264, 7703.82] + - - [64, 128, 36, 53824] + - [252, 6331.31] + - - [256, 256, 36, 32] + - [248, 4648.86] + - - [192, 256, 64, 288] + - [292, 8987.79] + - - [256, 256, 36, 16] + - [262, 2912.71] + - - [128, 256, 36, 3200] + - [265, 8680.27] + - - [160, 320, 64, 512] + - [224, 8449.44] + - - [128, 160, 36, 512] + - [235, 7214.97] + - - [96, 96, 36, 2592] + - [230, 7104.79] + - - [64, 96, 64, 800] + - [260, 7268.32] + - - [147, 64, 36, 18816] + - [276, 7116.26] + - - [160, 320, 36, 512] + - [230, 7874.82] + - - [256, 512, 36, 4] + - [269, 1034.78] + - - [96, 128, 64, 946] + - [252, 7901.07] + - - [256, 324, 64, 1568] + - [257, 8589.53] + - - [128, 128, 64, 50] + - [248, 4070.56] + - - [35, 96, 36, 8960] + - [242, 4207.3] + - - [32, 64, 36, 43808] + - [283, 4390.81] + - - [160, 224, 36, 128] + - [230, 5446.92] + - - [64, 64, 64, 81] + - [255, 2391.18] + - - [256, 256, 36, 3200] + - [221, 9559.55] + - - [256, 256, 36, 210] + - [232, 8414.61] + - - [192, 384, 64, 576] + - [292, 9468.75] + - - [512, 512, 64, 800] + - [267, 10096.4] + - - [512, 24, 36, 800] + - [218, 4761.77] + - - [64, 64, 64, 13216] + - [279, 8491.41] + - - [192, 224, 64, 1152] + - [235, 8769.06] + - - [256, 256, 64, 1152] + - [257, 9988.09] + - - [512, 486, 64, 512] + - [267, 9254.67] + - - [128, 128, 36, 784] + - [230, 7468.06] + - - [256, 512, 64, 1600] + - [254, 10232.5] + - - [512, 512, 64, 9] + - [248, 2599.78] + - - [96, 128, 64, 288] + - [260, 6599.43] + - - [64, 96, 36, 512] + - [260, 5073.75] + - - [256, 512, 36, 1568] + - [292, 9637.81] + - - [128, 128, 64, 400] + - [292, 8192.0] + - - [128, 128, 64, 800] + - [292, 8716.34] + - - [96, 128, 36, 512] + - [280, 6756.93] + - - [16, 32, 36, 360] + - [243, 754.036] + - - [128, 256, 64, 3200] + - [257, 10222.5] + - - [96, 128, 64, 800] + - [260, 7967.9] + - - [256, 512, 64, 4] + - [222, 1097.99] + - - [256, 256, 64, 450] + - [267, 9347.45] + - - [64, 64, 64, 3200] + - [278, 8518.08] + - - [192, 224, 64, 128] + - [238, 7035.17] + - - [128, 128, 64, 288] + - [292, 7751.28] + - - [256, 256, 64, 72] + - [248, 7489.83] + - - [96, 208, 36, 512] + - [260, 6939.11] + - - [128, 256, 36, 3136] + - [235, 8669.33] + - - [64, 64, 36, 3520] + - [230, 6007.47] + - - [64, 128, 36, 1568] + - [293, 6897.7] + - - [160, 320, 64, 242] + - [219, 7873.17] + - - [192, 192, 36, 512] + - [230, 7707.32] + - - [512, 512, 36, 512] + - [292, 9582.42] + - - [1, 16, 64, 10240] + - [244, 71.3511] + - - [128, 128, 36, 512] + - [230, 7149.38] + - - [512, 512, 36, 256] + - [221, 9384.4] + - - [512, 512, 36, 1024] + - [215, 9777.89] + - - [96, 208, 64, 1152] + - [293, 7850.9] + - - [128, 192, 64, 3200] + - [221, 9490.82] + - - [256, 256, 36, 4096] + - [226, 9585.46] + - - [160, 160, 64, 288] + - [260, 7299.8] + - - [256, 256, 64, 896] + - [257, 9850.33] + - - [128, 256, 64, 242] + - [292, 8391.38] + - - [128, 128, 36, 440] + - [235, 6274.72] + - - [96, 128, 36, 1568] + - [280, 7875.03] + - - [192, 384, 36, 1024] + - [226, 8715.72] + - - [64, 96, 36, 10368] + - [297, 7478.59] + - - [128, 256, 64, 100] + - [241, 7084.97] + - - [112, 224, 36, 2048] + - [234, 7555.92] + - - [384, 256, 64, 1152] + - [257, 10102.3] + - - [192, 384, 36, 128] + - [292, 7543.04] + - - [128, 128, 36, 7040] + - [265, 7600.6] + - - [128, 256, 64, 1568] + - [257, 10005.9] + - - [128, 128, 36, 1568] + - [249, 7848.3] + - - [128, 256, 64, 72] + - [272, 6553.6] + - - [256, 256, 36, 12544] + - [286, 9365.04] + - - [256, 256, 36, 105] + - [248, 7286.06] + - - [128, 256, 36, 392] + - [235, 7625.69] + - - [64, 64, 64, 5408] + - [278, 8882.67] + - - [3, 64, 36, 25088] + - [245, 528.942] + - - [384, 256, 36, 1024] + - [292, 9182.75] + - - [35, 96, 36, 13440] + - [299, 4110.29] + - - [128, 256, 64, 1152] + - [257, 9804.87] + - - [256, 324, 64, 32] + - [270, 5043.63] + - - [160, 224, 64, 128] + - [284, 6046.15] + - - [192, 224, 36, 2592] + - [282, 8878.68] + - - [96, 96, 64, 1152] + - [260, 8035.45] + - - [32, 64, 36, 90] + - [217, 964.465] + - - [64, 128, 64, 2888] + - [232, 9047.23] + - - [256, 384, 36, 800] + - [292, 9154.02] + - - [512, 512, 64, 4] + - [289, 1233.62] + - - [192, 320, 36, 128] + - [229, 7388.19] + - - [64, 128, 36, 480] + - [293, 5653.27] + - - [192, 384, 64, 242] + - [292, 9079.99] + - - [256, 486, 64, 32] + - [285, 5909.18] + - - [147, 64, 64, 9702] + - [294, 7319.69] + - - [512, 512, 64, 64] + - [228, 8179.02] + - - [64, 192, 64, 3698] + - [221, 9287.89] + - - [73, 192, 64, 10439] + - [252, 6668.02] + - - [1, 16, 36, 1440] + - [268, 33.4452] + - - [128, 256, 36, 512] + - [235, 7989.15] + - - [512, 512, 64, 576] + - [267, 9951.89] + - - [64, 64, 36, 12544] + - [283, 5872.77] + - - [128, 128, 36, 880] + - [280, 7597.26] + - - [192, 224, 36, 128] + - [238, 6451.2] + - - [64, 64, 64, 800] + - [278, 6916.73] + - - [64, 128, 36, 12544] + - [256, 6395.88] + - - [64, 64, 36, 1568] + - [230, 5536.66] + - - [160, 160, 36, 512] + - [230, 7345.26] + - - [512, 24, 64, 512] + - [220, 5242.88] + - - [3, 64, 36, 3136] + - [245, 475.352] + - - [256, 256, 64, 9] + - [270, 2106.51] + - - [3, 64, 64, 11552] + - [273, 785.127] + - - [128, 256, 36, 12544] + - [288, 8792.13] + - - [128, 128, 36, 3136] + - [249, 8098.46] + - - [256, 512, 36, 3136] + - [232, 9694.39] + - - [64, 64, 36, 196] + - [246, 2757.76] + - - [144, 288, 36, 512] + - [280, 7077.89] + - - [256, 24, 64, 32] + - [259, 1483.83] + - - [384, 384, 36, 800] + - [221, 9246.5] + - - [512, 512, 64, 1600] + - [267, 10277.3] + - - [112, 224, 36, 512] + - [235, 6744.78] + - - [128, 128, 36, 49] + - [241, 2716.29] + - - [512, 512, 36, 4] + - [269, 1156.52] + - - [35, 96, 64, 4235] + - [230, 4631.28] + - - [192, 384, 64, 450] + - [221, 9372.2] + - - [256, 256, 36, 1024] + - [292, 9346.64] + - - [112, 224, 64, 1152] + - [235, 7523.95] + - - [256, 512, 64, 400] + - [254, 9597.95] + - - [149, 32, 36, 19072] + - [299, 5811.8] + - - [128, 256, 36, 6272] + - [235, 8754.68] + - - [128, 192, 36, 1568] + - [260, 8195.1] + - - [256, 256, 36, 512] + - [292, 9074.22] + - - [256, 256, 64, 112] + - [292, 8305.55] + - - [512, 512, 64, 18] + - [285, 4324.02] + - - [256, 256, 64, 18] + - [248, 3547.81] + - - [256, 256, 64, 1568] + - [257, 10141.7] + - - [64, 96, 36, 1568] + - [278, 6805.66] + - - [384, 256, 36, 4096] + - [292, 9311.1] + - - [256, 512, 64, 800] + - [267, 9998.35] + - - [256, 384, 36, 2048] + - [292, 9285.34] + - - [3, 64, 36, 200704] + - [274, 547.375] + - - [384, 384, 64, 2304] + - [215, 9901.68] + - - [160, 320, 64, 128] + - [251, 7113.81] + - - [512, 512, 36, 528] + - [221, 9567.65] + - - [160, 320, 36, 128] + - [252, 6411.13] + - - [96, 96, 64, 800] + - [260, 7690.01] + - - [256, 512, 36, 49] + - [248, 6721.25] + - - [384, 384, 64, 450] + - [221, 9523.53] + - - [3, 64, 64, 23104] + - [273, 801.621] + - - [256, 256, 64, 3200] + - [257, 10300.4] + - - [128, 192, 36, 512] + - [235, 7499.75] + - - [192, 192, 64, 288] + - [292, 8774.24] + - - [96, 208, 64, 242] + - [252, 5901.99] + - - [256, 16, 36, 3200] + - [281, 3807.77] + - - [512, 512, 64, 8] + - [259, 2379.75] + - - [64, 128, 64, 5776] + - [232, 9332.74] + - - [512, 512, 64, 288] + - [221, 9521.99] + - - [256, 16, 36, 32] + - [277, 766.005] + - - [128, 192, 64, 288] + - [292, 8527.58] + - - [32, 64, 64, 640] + - [260, 4660.34] + - - [64, 64, 36, 392] + - [260, 3686.4] + - - [384, 384, 36, 1024] + - [226, 9282.48] + - - [64, 64, 36, 11552] + - [290, 5904.78] + - - [96, 128, 36, 6272] + - [280, 8350.99] + - - [128, 256, 36, 16] + - [262, 2144.81] + - - [256, 256, 64, 288] + - [292, 9140.13] + - - [64, 64, 64, 1652] + - [278, 7766.53] + - - [256, 384, 36, 1024] + - [226, 9203.27] + - - [96, 128, 64, 3200] + - [295, 8866.2] + - - [256, 324, 36, 3200] + - [234, 8194.25] + - - [128, 192, 64, 800] + - [292, 9198.03] + - - [64, 128, 64, 10] + - [233, 851.117] + - - [96, 208, 64, 288] + - [260, 6667.58] + - - [64, 96, 36, 2592] + - [242, 7216.88] + - - [64, 128, 64, 160] + - [271, 5190.97] + - - [192, 384, 64, 512] + - [221, 9446.04] + - - [64, 64, 36, 6272] + - [230, 6212.01] + - - [512, 24, 36, 288] + - [227, 3922.47] + - - [128, 128, 64, 1568] + - [221, 9037.86] + - - [112, 224, 64, 242] + - [291, 6399.26] + - - [128, 256, 64, 1600] + - [257, 10010.3] + - - [32, 32, 64, 20000] + - [225, 4378.41] + - - [160, 192, 64, 288] + - [252, 7803.63] + - - [512, 24, 64, 128] + - [213, 3733.8] + - - [512, 512, 36, 32] + - [248, 5935.34] + - - [3, 64, 36, 100352] + - [245, 542.783] + - - [3, 64, 64, 1444] + - [274, 674.159] + - - [512, 512, 36, 3136] + - [215, 9921.1] + - - [128, 256, 64, 6400] + - [275, 10349.3] + - - [256, 256, 36, 2048] + - [292, 9518.99] + - - [128, 160, 64, 288] + - [235, 7549.75] + - - [256, 256, 64, 6400] + - [257, 10392.6] + - - [32, 64, 64, 20000] + - [283, 6493.86] + - - [256, 256, 36, 1680] + - [232, 9513.29] + - - [128, 128, 64, 210] + - [292, 7094.1] + - - [192, 384, 36, 2048] + - [221, 8818.65] + - - [256, 256, 64, 144] + - [292, 8608.61] + - - [384, 384, 36, 4096] + - [226, 9356.94] + - - [160, 320, 64, 1152] + - [252, 8749.48] + - - [384, 256, 36, 2048] + - [292, 9279.63] + - - [256, 512, 36, 392] + - [292, 9252.14] + - - [256, 512, 64, 50] + - [248, 7511.29] + - - [73, 192, 36, 23360] + - [296, 5802.93] + - - [3, 64, 36, 50176] + - [245, 542.037] + - - [384, 384, 36, 2048] + - [221, 9325.8] + - - [256, 384, 64, 450] + - [267, 9528.66] + - - [192, 320, 64, 128] + - [226, 8399.81] + - - [128, 256, 36, 32] + - [241, 3276.8] + - - [160, 192, 36, 512] + - [280, 7752.34] + - - [512, 512, 64, 256] + - [232, 9473.64] + - - [256, 512, 64, 32] + - [270, 6391.32] + - - [384, 384, 64, 576] + - [221, 9614.79] + - - [64, 64, 64, 648] + - [278, 6282.15] + - - [512, 486, 36, 288] + - [292, 8624.93] + - - [32, 64, 36, 1440] + - [230, 3961.5] + - - [144, 288, 64, 242] + - [252, 6347.02] + - - [384, 256, 64, 576] + - [257, 9775.24] + - - [512, 512, 36, 64] + - [228, 7791.28] + - - [448, 384, 64, 128] + - [221, 9132.23] + - - [64, 128, 64, 722] + - [271, 8047.11] + - - [144, 288, 64, 288] + - [280, 6859.4] + - - [512, 512, 64, 224] + - [292, 9427.29] + - - [112, 224, 64, 288] + - [291, 6736.92] + - - [384, 384, 64, 1152] + - [215, 9820.46] + - - [448, 384, 36, 128] + - [292, 8761.31] + - - [64, 64, 64, 100] + - [238, 2708.1] + - - [256, 486, 36, 128] + - [264, 7640.04] + - - [64, 96, 64, 4608] + - [293, 8351.49] + - - [16, 32, 64, 160] + - [217, 736.36] + - - [64, 192, 36, 6272] + - [293, 8041.19] + - - [64, 64, 64, 200] + - [246, 3924.31] + - - [256, 256, 36, 800] + - [292, 9299.55] + - - [64, 128, 36, 6272] + - [290, 6816.36] + - - [32, 64, 64, 40] + - [237, 885.622] + - - [256, 16, 64, 32] + - [287, 1205.26] + - - [192, 384, 36, 800] + - [226, 8673.88] + - - [128, 128, 36, 3200] + - [260, 8538.89] + - - [256, 256, 36, 256] + - [232, 8454.36] + - - [192, 384, 64, 1152] + - [221, 9589.01] + - - [128, 256, 64, 200] + - [231, 8141.12] + - - [64, 96, 64, 1152] + - [260, 7620.88] + - - [128, 128, 36, 392] + - [235, 6175.51] + - - [80, 192, 36, 10368] + - [283, 6497.16] + - - [224, 224, 36, 128] + - [293, 5826.89] + - - [512, 512, 64, 28] + - [248, 5728.81] + - - [256, 16, 64, 1568] + - [263, 4637.2] + - - [144, 288, 64, 1152] + - [280, 7784.24] + - - [256, 256, 64, 576] + - [257, 9596.12] + - - [64, 128, 36, 784] + - [293, 6058.99] + - - [256, 24, 36, 128] + - [227, 2239.84] + - - [256, 256, 64, 2304] + - [257, 10225.7] + - - [192, 384, 36, 512] + - [292, 8549.03] + - - [16, 32, 64, 2560] + - [245, 2153.13] + - - [256, 512, 36, 32] + - [270, 5702.23] + - - [512, 512, 64, 128] + - [292, 9084.11] + - - [128, 128, 64, 200] + - [229, 6971.91] + - - [512, 512, 64, 32] + - [241, 6248.5] + - - [128, 256, 36, 196] + - [241, 6628.76] + - - [8, 384, 64, 6600] + - [273, 2733.89] + - - [149, 32, 64, 8195] + - [235, 6050.91] + - - [35, 96, 64, 6160] + - [280, 4689.35] + - - [64, 64, 36, 1760] + - [230, 5622.24] - - [1024, 128, 1, 128] - - [170, 1028.12] + - [304, 1028.12] - - [4, 704, 1, 1280] - - [209, 363.455] + - [343, 363.455] - - [4, 1856, 1, 3328] - - [209, 579.534] + - [343, 579.534] - - [1856, 448, 1, 3328] - - [246, 6966.83] + - [380, 6966.83] - - [2944, 4288, 1, 1280] - - [241, 9057.98] + - [375, 9057.98] - - [2368, 64, 1, 3328] - - [202, 5837.66] + - [336, 5837.66] - - [2368, 5888, 1, 256] - - [246, 9111.16] + - [380, 9111.16] - - [128, 64, 1, 256] - - [208, 374.591] + - [342, 374.591] - - [5888, 1024, 1, 1280] - - [251, 8570.54] + - [385, 8570.54] - - [128, 6784, 1, 3328] - - [214, 7703.96] + - [348, 7703.96] - - [64, 4, 1, 256] - - [260, 11.3219] + - [394, 11.3219] - - [5888, 1856, 1, 3328] - - [246, 9394.4] + - [380, 9394.4] - - [5056, 704, 1, 256] - - [249, 8026.99] + - [383, 8026.99] - - [5888, 2944, 1, 3328] - - [239, 7608.21] + - [373, 7608.21] - - [1856, 4288, 1, 256] - - [240, 8986.42] + - [374, 8986.42] - - [1024, 5056, 1, 128] - - [232, 3898.34] + - [366, 3898.34] - - [5056, 5056, 1, 3328] - - [240, 9536.85] + - [374, 9536.85] - - [1408, 5888, 1, 1280] - - [241, 9279.19] + - [375, 9279.19] - - [2368, 448, 1, 128] - - [233, 2474.42] + - [367, 2474.42] - - [1024, 3584, 1, 3328] - - [243, 9258.58] + - [377, 9258.58] - - [4, 2944, 1, 1280] - - [195, 611.84] + - [329, 611.84] - - [1408, 64, 1, 128] - - [166, 858.31] + - [300, 858.31] - - [256, 4288, 1, 3328] - - [246, 7616.08] + - [380, 7616.08] - - [5888, 1408, 1, 1280] - - [239, 9620.39] + - [373, 9620.39] - - [704, 1856, 1, 3328] - - [240, 9033.75] + - [374, 9033.75] - - [4, 1408, 1, 128] - - [253, 24.455] + - [387, 24.455] - - [1024, 2368, 1, 256] - - [240, 7526.25] + - [374, 7526.25] - - [1408, 1856, 1, 1280] - - [243, 8324.19] + - [377, 8324.19] - - [1408, 64, 1, 1280] - - [214, 4681.24] + - [348, 4681.24] - - [448, 1024, 1, 1280] - - [240, 7112.53] + - [374, 7112.53] - - [256, 1408, 1, 3328] - - [246, 5825.51] + - [380, 5825.51] - - [5056, 5056, 1, 1280] - - [249, 9233.65] + - [383, 9233.65] - - [448, 5056, 1, 256] - - [241, 7003.27] + - [375, 7003.27] - - [704, 1856, 1, 1280] - - [240, 8877.38] + - [374, 8877.38] - - [128, 5056, 1, 128] - - [232, 2301.14] + - [366, 2301.14] - - [2368, 128, 1, 256] - - [240, 3849.04] + - [374, 3849.04] - - [1856, 1408, 1, 128] - - [235, 4202.31] + - [369, 4202.31] - - [64, 5056, 1, 256] - - [241, 3109.62] + - [375, 3109.62] - - [6784, 256, 1, 3328] - - [240, 6388.53] + - [374, 6388.53] - - [6784, 4288, 1, 3328] - - [251, 9114.67] + - [385, 9114.67] - - [4288, 448, 1, 256] - - [244, 5783.05] + - [378, 5783.05] - - [64, 704, 1, 128] - - [177, 379.519] + - [311, 379.519] - - [1856, 2368, 1, 3328] - - [240, 9128.46] + - [374, 9128.46] - - [4288, 2944, 1, 1280] - - [246, 9182.33] + - [380, 9182.33] - - [704, 5056, 1, 1280] - - [240, 9071.57] + - [374, 9071.57] - - [2368, 704, 1, 3328] - - [246, 7731.43] + - [380, 7731.43] - - [256, 5888, 1, 256] - - [240, 7920.38] + - [374, 7920.38] - - [1856, 4288, 1, 3328] - - [246, 9330.07] + - [380, 9330.07] - - [256, 2944, 1, 256] - - [247, 5312.27] + - [381, 5312.27] - - [5888, 1024, 1, 256] - - [238, 6710.97] + - [372, 6710.97] - - [448, 64, 1, 1280] - - [213, 2814.53] + - [347, 2814.53] - - [448, 5056, 1, 3328] - - [240, 8255.53] + - [374, 8255.53] - - [3584, 4, 1, 1280] - - [189, 640.815] + - [323, 640.815] - - [2944, 64, 1, 256] - - [188, 2621.54] + - [322, 2621.54] - - [128, 4, 1, 1280] - - [260, 86.3316] + - [394, 86.3316] - - [1408, 2944, 1, 256] - - [240, 8848.99] + - [374, 8848.99] - - [256, 1856, 1, 1280] - - [240, 7366.55] + - [374, 7366.55] - - [6784, 5056, 1, 3328] - - [251, 8332.16] + - [385, 8332.16] - - [5056, 5056, 1, 256] - - [246, 9171.74] + - [380, 9171.74] - - [1408, 6784, 1, 128] - - [232, 5079.19] + - [366, 5079.19] - - [64, 1024, 1, 1280] - - [204, 3679.31] + - [338, 3679.31] - - [2944, 4, 1, 256] - - [195, 369.543] + - [329, 369.543] - - [704, 5056, 1, 128] - - [232, 4509.27] + - [366, 4509.27] - - [4, 2368, 1, 1280] - - [189, 569.844] + - [323, 569.844] - - [2368, 2944, 1, 1280] - - [251, 7451.14] + - [385, 7451.14] - - [128, 3584, 1, 1280] - - [249, 6071.26] + - [383, 6071.26] - - [6784, 6784, 1, 1280] - - [246, 9535.74] - - - [1024, 256, 1, 3328] - - [240, 5742.68] + - [380, 9535.74] - - [1408, 4288, 1, 1280] - - [249, 8255.09] + - [383, 8255.09] - - [3584, 4288, 1, 1280] - - [251, 9651.19] + - [385, 9651.19] - - [2368, 704, 1, 1280] - - [246, 8291.4] + - [380, 8291.4] - - [5056, 4288, 1, 3328] - - [238, 9406.36] + - [372, 9406.36] - - [3584, 2368, 1, 3328] - - [246, 9350.32] + - [380, 9350.32] - - [64, 704, 1, 1280] - - [213, 3384.59] + - [347, 3384.59] - - [4288, 256, 1, 256] - - [246, 5593.62] + - [380, 5593.62] - - [2944, 128, 1, 128] - - [168, 2130.6] + - [302, 2130.6] - - [6784, 448, 1, 1280] - - [249, 8815.85] + - [383, 8815.85] - - [1408, 2944, 1, 128] - - [232, 4558.34] + - [366, 4558.34] - - [4288, 2944, 1, 256] - - [251, 7865.43] + - [385, 7865.43] - - [5888, 704, 1, 1280] - - [240, 9262.99] + - [374, 9262.99] - - [1856, 64, 1, 1280] - - [214, 4359.15] + - [348, 4359.15] - - [448, 5888, 1, 128] - - [235, 4000.59] + - [369, 4000.59] - - [5888, 64, 1, 3328] - - [215, 6603.39] + - [349, 6603.39] - - [2944, 256, 1, 3328] - - [240, 8423.63] + - [374, 8423.63] - - [1024, 64, 1, 128] - - [185, 582.642] + - [319, 582.642] - - [5056, 2368, 1, 1280] - - [240, 9419.91] + - [374, 9419.91] - - [448, 3584, 1, 1280] - - [240, 7985.82] + - [374, 7985.82] - - [6784, 5888, 1, 256] - - [238, 9494.36] + - [372, 9494.36] - - [704, 1024, 1, 128] - - [232, 2813.35] + - [366, 2813.35] - - [704, 128, 1, 1280] - - [214, 4477.71] + - [348, 4477.71] - - [5888, 2944, 1, 128] - - [235, 4745.96] + - [369, 4745.96] - - [4, 3584, 1, 128] - - [252, 96.479] + - [386, 96.479] - - [1408, 448, 1, 1280] - - [240, 6912.8] + - [374, 6912.8] - - [1024, 1408, 1, 256] - - [248, 5810.85] + - [382, 5810.85] - - [2368, 2368, 1, 3328] - - [249, 9088.71] + - [383, 9088.71] - - [1856, 6784, 1, 128] - - [235, 5168.32] + - [369, 5168.32] - - [5056, 704, 1, 3328] - - [241, 7464.9] + - [375, 7464.9] - - [1408, 1856, 1, 256] - - [246, 6727.69] + - [380, 6727.69] - - [1408, 704, 1, 3328] - - [246, 8379.53] + - [380, 8379.53] - - [2368, 5056, 1, 256] - - [246, 8664.11] + - [380, 8664.11] - - [5888, 1856, 1, 256] - - [251, 5810.02] + - [385, 5810.02] - - [4288, 64, 1, 3328] - - [228, 6583.94] + - [362, 6583.94] - - [2368, 4, 1, 1280] - - [261, 545.251] + - [395, 545.251] - - [704, 5888, 1, 256] - - [246, 8813.71] + - [380, 8813.71] - - [4288, 64, 1, 256] - - [204, 3059.97] + - [338, 3059.97] - - [6784, 64, 1, 256] - - [246, 3490.96] + - [380, 3490.96] - - [2944, 256, 1, 256] - - [240, 6970.4] + - [374, 6970.4] - - [2944, 6784, 1, 3328] - - [240, 9475.79] + - [374, 9475.79] - - [704, 1408, 1, 3328] - - [240, 8154.18] + - [374, 8154.18] - - [3584, 704, 1, 3328] - - [240, 8995.07] + - [374, 8995.07] - - [2944, 256, 1, 128] - - [232, 2824.13] + - [366, 2824.13] - - [6784, 4, 1, 1280] - - [189, 625.714] + - [323, 625.714] - - [1024, 64, 1, 1280] - - [201, 3307.91] + - [335, 3307.91] - - [448, 4288, 1, 256] - - [246, 6074.48] + - [380, 6074.48] - - [64, 3584, 1, 3328] - - [194, 6200.26] + - [328, 6200.26] - - [704, 2368, 1, 1280] - - [240, 8291.4] + - [374, 8291.4] - - [448, 2944, 1, 128] - - [232, 3221.87] + - [366, 3221.87] - - [1856, 2368, 1, 1280] - - [251, 6855.24] + - [385, 6855.24] - - [2368, 128, 1, 3328] - - [202, 6479.61] + - [336, 6479.61] - - [2944, 128, 1, 256] - - [240, 3828.23] + - [374, 3828.23] - - [448, 1408, 1, 256] - - [241, 4525.9] + - [375, 4525.9] - - [1856, 4288, 1, 1280] - - [239, 9160.32] + - [373, 9160.32] - - [64, 5056, 1, 3328] - - [222, 6819.3] + - [356, 6819.3] - - [4, 704, 1, 256] - - [206, 123.541] + - [340, 123.541] - - [1024, 448, 1, 128] - - [235, 1989.27] + - [369, 1989.27] - - [704, 4, 1, 1280] - - [209, 381.931] + - [343, 381.931] - - [704, 256, 1, 128] - - [232, 1109.17] + - [366, 1109.17] - - [704, 2944, 1, 128] - - [232, 4089.03] + - [366, 4089.03] - - [1408, 1024, 1, 1280] - - [246, 8192.08] + - [380, 8192.08] - - [704, 6784, 1, 256] - - [240, 6717.9] + - [374, 6717.9] - - [6784, 704, 1, 256] - - [246, 5429.22] + - [380, 5429.22] - - [5056, 1408, 1, 128] - - [232, 4954.5] + - [366, 4954.5] - - [256, 3584, 1, 3328] - - [240, 7890.96] + - [374, 7890.96] - - [4, 5888, 1, 3328] - - [257, 691.047] + - [391, 691.047] - - [128, 1408, 1, 128] - - [179, 1393.14] + - [313, 1393.14] - - [3584, 4288, 1, 3328] - - [242, 8900.87] + - [376, 8900.87] - - [5888, 1856, 1, 1280] - - [243, 9345.85] + - [377, 9345.85] - - [5056, 1024, 1, 3328] - - [244, 7834.84] + - [378, 7834.84] - - [5056, 64, 1, 1280] - - [222, 5890.14] + - [356, 5890.14] - - [1024, 704, 1, 256] - - [240, 6007.57] + - [374, 6007.57] - - [1024, 4288, 1, 128] - - [234, 3497.09] + - [368, 3497.09] - - [4288, 64, 1, 1280] - - [219, 4726.59] + - [353, 4726.59] - - [2368, 3584, 1, 1280] - - [238, 8128.82] + - [372, 8128.82] - - [2368, 6784, 1, 1280] - - [238, 9478.72] + - [372, 9478.72] - - [1024, 256, 1, 256] - - [246, 4092.1] + - [380, 4092.1] - - [1856, 4, 1, 1280] - - [261, 509.903] + - [395, 509.903] - - [448, 448, 1, 256] - - [246, 3001.28] + - [380, 3001.28] - - [2944, 3584, 1, 3328] - - [247, 9081.91] + - [381, 9081.91] - - [128, 4288, 1, 128] - - [167, 2323.33] + - [301, 2323.33] - - [64, 448, 1, 256] - - [210, 1066.97] + - [344, 1066.97] - - [128, 1024, 1, 3328] - - [223, 6392.36] + - [357, 6392.36] - - [4, 1408, 1, 3328] - - [206, 616.656] + - [340, 616.656] - - [6784, 2944, 1, 256] - - [249, 8547.73] + - [383, 8547.73] - - [64, 1856, 1, 1280] - - [222, 4409.71] + - [356, 4409.71] - - [64, 1024, 1, 128] - - [166, 554.902] + - [300, 554.902] - - [4288, 2368, 1, 3328] - - [242, 8780.08] + - [376, 8780.08] - - [1856, 2368, 1, 256] - - [249, 4976.74] + - [383, 4976.74] - - [3584, 256, 1, 128] - - [234, 2812.37] + - [368, 2812.37] - - [3584, 6784, 1, 3328] - - [244, 9278.22] + - [378, 9278.22] - - [256, 1024, 1, 256] - - [240, 4346.53] + - [374, 4346.53] - - [4, 6784, 1, 3328] - - [259, 681.366] + - [393, 681.366] - - [1024, 5888, 1, 3328] - - [240, 9187.61] + - [374, 9187.61] - - [1024, 128, 1, 1280] - - [192, 3660.05] + - [326, 3660.05] - - [4288, 128, 1, 1280] - - [246, 6019.17] + - [380, 6019.17] - - [5056, 4288, 1, 1280] - - [238, 9343.96] + - [372, 9343.96] - - [5888, 64, 1, 256] - - [240, 4692.17] + - [374, 4692.17] - - [1856, 256, 1, 1280] - - [246, 4790.38] + - [380, 4790.38] - - [64, 5888, 1, 3328] - - [214, 6702.2] + - [348, 6702.2] - - [2944, 5888, 1, 128] - - [235, 5202.65] + - [369, 5202.65] - - [704, 5888, 1, 1280] - - [240, 9264.29] + - [374, 9264.29] - - [2368, 3584, 1, 128] - - [232, 5053.71] + - [366, 5053.71] - - [6784, 5888, 1, 3328] - - [238, 7926.8] + - [372, 7926.8] - - [704, 1024, 1, 1280] - - [239, 5402.6] + - [373, 5402.6] - - [448, 256, 1, 3328] - - [222, 6124.65] + - [356, 6124.65] - - [448, 1856, 1, 128] - - [233, 2885.96] + - [367, 2885.96] - - [128, 1024, 1, 128] - - [167, 1013.22] + - [301, 1013.22] - - [2944, 4, 1, 128] - - [252, 77.6374] + - [386, 77.6374] - - [1024, 704, 1, 1280] - - [240, 7365.58] + - [374, 7365.58] - - [128, 5888, 1, 256] - - [240, 6990.61] + - [374, 6990.61] - - [1024, 5056, 1, 1280] - - [245, 9422.0] + - [379, 9422.0] - - [4288, 1024, 1, 256] - - [247, 6270.03] + - [381, 6270.03] - - [2944, 2368, 1, 128] - - [232, 4918.18] + - [366, 4918.18] - - [704, 704, 1, 3328] - - [240, 7963.65] + - [374, 7963.65] - - [704, 1408, 1, 1280] - - [240, 8347.32] + - [374, 8347.32] - - [5888, 448, 1, 1280] - - [246, 5217.05] + - [380, 5217.05] - - [3584, 256, 1, 3328] - - [240, 7802.25] + - [374, 7802.25] - - [704, 5888, 1, 3328] - - [246, 8381.46] + - [380, 8381.46] - - [704, 1856, 1, 128] - - [232, 3598.38] + - [366, 3598.38] - - [128, 3584, 1, 3328] - - [202, 7161.11] + - [336, 7161.11] - - [6784, 2368, 1, 1280] - - [251, 9464.41] + - [385, 9464.41] - - [4, 4288, 1, 128] - - [252, 132.68] + - [386, 132.68] - - [128, 704, 1, 1280] - - [214, 4463.85] + - [348, 4463.85] - - [3584, 2944, 1, 256] - - [251, 8201.24] + - [385, 8201.24] - - [1856, 128, 1, 3328] - - [193, 6575.5] + - [327, 6575.5] - - [4, 64, 1, 1280] - - [209, 43.6745] + - [343, 43.6745] - - [4, 5056, 1, 3328] - - [189, 675.315] + - [323, 675.315] - - [128, 2944, 1, 1280] - - [193, 5916.99] + - [327, 5916.99] - - [2368, 1024, 1, 3328] - - [246, 8646.84] + - [380, 8646.84] - - [128, 256, 1, 3328] - - [227, 4130.85] + - [361, 4130.85] - - [1408, 5056, 1, 3328] - - [245, 9529.75] + - [379, 9529.75] - - [1856, 1856, 1, 3328] - - [244, 8114.99] + - [378, 8114.99] - - [3584, 128, 1, 256] - - [240, 5603.18] + - [374, 5603.18] - - [448, 1408, 1, 3328] - - [240, 7073.03] + - [374, 7073.03] - - [2368, 2368, 1, 256] - - [247, 7648.76] + - [381, 7648.76] - - [4288, 4288, 1, 1280] - - [242, 9244.11] + - [376, 9244.11] - - [64, 448, 1, 1280] - - [213, 2885.33] + - [347, 2885.33] - - [1408, 4288, 1, 256] - - [240, 8080.41] + - [374, 8080.41] - - [448, 4, 1, 256] - - [258, 84.4294] + - [392, 84.4294] - - [5888, 448, 1, 128] - - [235, 3540.8] + - [369, 3540.8] - - [448, 4, 1, 1280] - - [209, 322.257] + - [343, 322.257] - - [704, 6784, 1, 3328] - - [239, 8613.58] + - [373, 8613.58] - - [5888, 5888, 1, 1280] - - [246, 9502.05] + - [380, 9502.05] - - [5056, 1024, 1, 1280] - - [249, 9110.11] + - [383, 9110.11] - - [448, 5888, 1, 3328] - - [240, 8586.43] + - [374, 8586.43] - - [128, 4, 1, 128] - - [252, 4.27959] + - [386, 4.27959] - - [1024, 2944, 1, 1280] - - [248, 7096.53] + - [382, 7096.53] - - [5056, 5888, 1, 1280] - - [239, 9693.51] + - [373, 9693.51] - - [4288, 5888, 1, 128] - - [232, 5406.46] + - [366, 5406.46] - - [256, 3584, 1, 256] - - [240, 6908.37] + - [374, 6908.37] - - [1408, 3584, 1, 128] - - [232, 4645.69] + - [366, 4645.69] - - [256, 2944, 1, 3328] - - [243, 6284.4] + - [377, 6284.4] - - [448, 3584, 1, 128] - - [235, 3675.37] + - [369, 3675.37] - - [5888, 2944, 1, 1280] - - [245, 9628.9] + - [379, 9628.9] - - [4, 6784, 1, 1280] - - [189, 688.176] + - [323, 688.176] - - [2368, 5888, 1, 128] - - [232, 5273.96] + - [366, 5273.96] - - [64, 2944, 1, 128] - - [176, 1316.54] + - [310, 1316.54] - - [3584, 5888, 1, 256] - - [246, 9239.14] + - [380, 9239.14] - - [2368, 704, 1, 128] - - [235, 3537.65] + - [369, 3537.65] - - [3584, 2944, 1, 1280] - - [240, 9324.62] + - [374, 9324.62] - - [3584, 2368, 1, 128] - - [232, 4766.34] + - [366, 4766.34] - - [5056, 704, 1, 128] - - [232, 4487.95] + - [366, 4487.95] - - [448, 2368, 1, 128] - - [235, 2877.02] + - [369, 2877.02] - - [5056, 1408, 1, 3328] - - [251, 9515.97] + - [385, 9515.97] - - [1408, 704, 1, 256] - - [243, 6836.18] + - [377, 6836.18] - - [6784, 1024, 1, 3328] - - [238, 9309.65] + - [372, 9309.65] - - [6784, 2944, 1, 3328] - - [239, 9536.58] + - [373, 9536.58] - - [2944, 5056, 1, 3328] - - [240, 9526.25] + - [374, 9526.25] - - [1856, 1856, 1, 256] - - [240, 5239.24] + - [374, 5239.24] - - [1024, 5888, 1, 128] - - [232, 4006.28] + - [366, 4006.28] - - [2048, 7133, 1, 2048] - - [238, 9828.07] + - [372, 9828.07] - - [256, 4, 1, 128] - - [253, 4.38908] + - [387, 4.38908] - - [4288, 5888, 1, 1280] - - [248, 9202.83] + - [382, 9202.83] - - [4288, 4288, 1, 256] - - [243, 5521.18] + - [377, 5521.18] - - [448, 2944, 1, 3328] - - [246, 7724.53] + - [380, 7724.53] - - [4288, 1856, 1, 1280] - - [246, 8826.34] + - [380, 8826.34] - - [1856, 2944, 1, 3328] - - [240, 9194.9] + - [374, 9194.9] - - [256, 6784, 1, 3328] - - [240, 8740.33] + - [374, 8740.33] - - [64, 5888, 1, 256] - - [240, 4766.35] + - [374, 4766.35] - - [256, 5056, 1, 128] - - [232, 2937.6] + - [366, 2937.6] - - [5056, 1024, 1, 256] - - [251, 5467.91] + - [385, 5467.91] - - [704, 64, 1, 3328] - - [228, 4818.43] + - [362, 4818.43] - - [5056, 1856, 1, 3328] - - [245, 8861.69] + - [379, 8861.69] - - [4, 2944, 1, 3328] - - [195, 662.102] + - [329, 662.102] - - [4, 5056, 1, 256] - - [255, 494.121] + - [389, 494.121] - - [1856, 1408, 1, 256] - - [240, 8674.78] + - [374, 8674.78] - - [3584, 4, 1, 128] - - [252, 108.296] + - [386, 108.296] - - [448, 448, 1, 3328] - - [214, 6457.4] + - [348, 6457.4] - - [6784, 128, 1, 3328] - - [207, 7256.71] + - [341, 7256.71] - - [4288, 1408, 1, 128] - - [235, 4791.76] + - [369, 4791.76] - - [4288, 5056, 1, 256] - - [240, 8560.84] + - [374, 8560.84] - - [1408, 128, 1, 1280] - - [222, 5085.79] + - [356, 5085.79] - - [5056, 256, 1, 3328] - - [243, 7284.23] + - [377, 7284.23] - - [704, 704, 1, 256] - - [240, 6171.19] + - [374, 6171.19] - - [1024, 5888, 1, 1280] - - [245, 8852.89] + - [379, 8852.89] - - [6784, 2368, 1, 128] - - [233, 4729.3] + - [367, 4729.3] - - [4, 5056, 1, 1280] - - [206, 670.046] + - [340, 670.046] - - [64, 128, 1, 256] - - [208, 369.317] + - [342, 369.317] - - [128, 1856, 1, 1280] - - [202, 5549.13] + - [336, 5549.13] - - [5056, 3584, 1, 256] - - [246, 7115.84] + - [380, 7115.84] - - [1856, 1024, 1, 1280] - - [238, 8196.5] + - [372, 8196.5] - - [6784, 4288, 1, 1280] - - [239, 9509.66] + - [373, 9509.66] - - [1856, 1856, 1, 1280] - - [241, 5791.99] + - [375, 5791.99] - - [6784, 2944, 1, 128] - - [232, 5317.12] + - [366, 5317.12] - - [1408, 5056, 1, 1280] - - [241, 8980.73] + - [375, 8980.73] - - [4, 2368, 1, 3328] - - [206, 592.634] + - [340, 592.634] - - [5888, 1856, 1, 128] - - [231, 4600.2] + - [365, 4600.2] - - [448, 704, 1, 1280] - - [240, 2286.58] + - [374, 2286.58] - - [2368, 1024, 1, 128] - - [235, 3911.12] + - [369, 3911.12] - - [1024, 448, 1, 3328] - - [240, 7295.24] + - [374, 7295.24] - - [1856, 704, 1, 1280] - - [240, 8881.12] + - [374, 8881.12] - - [5056, 3584, 1, 128] - - [232, 4911.68] + - [366, 4911.68] - - [5888, 5888, 1, 3328] - - [248, 9243.9] + - [382, 9243.9] - - [6784, 1024, 1, 256] - - [251, 5475.41] + - [385, 5475.41] - - [2944, 2368, 1, 256] - - [246, 5670.77] + - [380, 5670.77] - - [256, 448, 1, 256] - - [197, 2293.86] + - [331, 2293.86] - - [5056, 5888, 1, 3328] - - [241, 7848.07] + - [375, 7848.07] - - [1856, 1024, 1, 256] - - [246, 7517.7] + - [380, 7517.7] - - [448, 1408, 1, 1280] - - [240, 6917.54] + - [374, 6917.54] - - [3584, 448, 1, 1280] - - [246, 7980.86] + - [380, 7980.86] - - [1024, 1024, 1, 1280] - - [243, 8384.52] + - [377, 8384.52] - - [448, 5888, 1, 256] - - [240, 7365.75] + - [374, 7365.75] - - [704, 64, 1, 128] - - [185, 358.755] + - [319, 358.755] - - [1408, 6784, 1, 3328] - - [246, 9094.19] + - [380, 9094.19] - - [448, 1024, 1, 128] - - [235, 1773.05] + - [369, 1773.05] - - [4288, 704, 1, 128] - - [232, 4355.38] + - [366, 4355.38] - - [128, 1856, 1, 128] - - [171, 1610.73] + - [305, 1610.73] - - [448, 2368, 1, 3328] - - [246, 7366.47] + - [380, 7366.47] - - [5056, 64, 1, 128] - - [171, 2157.33] + - [305, 2157.33] - - [5056, 2944, 1, 256] - - [240, 9123.16] + - [374, 9123.16] - - [6784, 5888, 1, 128] - - [231, 5285.9] + - [365, 5285.9] - - [704, 1024, 1, 256] - - [246, 6667.35] + - [380, 6667.35] - - [1024, 4, 1, 256] - - [195, 187.346] + - [329, 187.346] - - [2368, 1856, 1, 256] - - [246, 6777.94] + - [380, 6777.94] - - [128, 6784, 1, 1280] - - [243, 7052.71] + - [377, 7052.71] - - [1408, 3584, 1, 3328] - - [247, 9038.05] + - [381, 9038.05] - - [2368, 6784, 1, 256] - - [240, 9181.45] + - [374, 9181.45] - - [5056, 1408, 1, 1280] - - [245, 9422.0] + - [379, 9422.0] - - [256, 256, 1, 128] - - [177, 543.404] + - [311, 543.404] - - [5056, 4288, 1, 128] - - [235, 5340.02] + - [369, 5340.02] - - [1408, 1856, 1, 128] - - [232, 4270.99] + - [366, 4270.99] - - [1408, 5888, 1, 3328] - - [244, 9034.89] + - [378, 9034.89] - - [1856, 256, 1, 256] - - [246, 5847.93] + - [380, 5847.93] - - [6784, 6784, 1, 256] - - [239, 9624.48] + - [373, 9624.48] - - [64, 256, 1, 128] - - [178, 146.549] + - [312, 146.549] - - [4288, 2368, 1, 128] - - [231, 3897.04] + - [365, 3897.04] - - [1856, 4288, 1, 128] - - [232, 4337.17] + - [366, 4337.17] - - [256, 4288, 1, 1280] - - [240, 7499.52] + - [374, 7499.52] - - [2368, 2944, 1, 256] - - [245, 7703.28] + - [379, 7703.28] - - [4, 1856, 1, 256] - - [258, 264.064] + - [392, 264.064] - - [3584, 1856, 1, 1280] - - [240, 9224.43] + - [374, 9224.43] - - [6784, 6784, 1, 128] - - [232, 5476.13] + - [366, 5476.13] - - [256, 1856, 1, 128] - - [235, 1858.82] + - [369, 1858.82] - - [704, 64, 1, 1280] - - [213, 3368.77] + - [347, 3368.77] - - [5888, 5056, 1, 256] - - [246, 5859.91] + - [380, 5859.91] - - [3584, 448, 1, 256] - - [246, 7298.43] + - [380, 7298.43] - - [448, 4288, 1, 128] - - [232, 3813.55] + - [366, 3813.55] - - [2944, 4288, 1, 3328] - - [241, 9149.73] + - [375, 9149.73] - - [256, 6784, 1, 256] - - [240, 7984.95] + - [374, 7984.95] - - [1408, 4288, 1, 128] - - [235, 4728.44] + - [369, 4728.44] - - [2944, 704, 1, 3328] - - [246, 7149.86] + - [380, 7149.86] - - [128, 448, 1, 256] - - [212, 1699.18] + - [346, 1699.18] - - [512, 32, 1, 512] - - [212, 1127.6] + - [346, 1127.6] - - [3584, 3584, 1, 256] - - [241, 8558.11] + - [375, 8558.11] - - [448, 1408, 1, 128] - - [232, 2504.45] + - [366, 2504.45] - - [128, 256, 1, 1280] - - [213, 3216.59] + - [347, 3216.59] - - [3584, 5056, 1, 256] - - [238, 5674.45] + - [372, 5674.45] - - [6784, 128, 1, 256] - - [240, 6216.49] + - [374, 6216.49] - - [4288, 4, 1, 256] - - [256, 435.706] + - [390, 435.706] - - [64, 1408, 1, 3328] - - [214, 6186.01] + - [348, 6186.01] - - [704, 448, 1, 256] - - [246, 4005.08] + - [380, 4005.08] - - [2944, 2368, 1, 1280] - - [247, 8542.8] + - [381, 8542.8] - - [448, 64, 1, 3328] - - [227, 3835.33] + - [361, 3835.33] - - [1408, 3584, 1, 256] - - [240, 8714.63] + - [374, 8714.63] - - [3584, 4, 1, 3328] - - [195, 689.554] + - [329, 689.554] - - [6784, 3584, 1, 256] - - [245, 9271.34] + - [379, 9271.34] - - [256, 128, 1, 128] - - [178, 283.499] + - [312, 283.499] - - [704, 1408, 1, 128] - - [232, 3210.57] + - [366, 3210.57] - - [4, 2368, 1, 256] - - [258, 360.938] + - [392, 360.938] - - [2944, 448, 1, 128] - - [232, 3344.41] + - [366, 3344.41] - - [128, 1408, 1, 256] - - [240, 3186.38] + - [374, 3186.38] - - [4, 2944, 1, 256] - - [256, 384.622] + - [390, 384.622] - - [64, 128, 1, 3328] - - [209, 2103.72] + - [343, 2103.72] - - [5056, 2368, 1, 128] - - [232, 5219.76] + - [366, 5219.76] - - [2944, 2944, 1, 3328] - - [249, 9174.69] + - [383, 9174.69] - - [5056, 6784, 1, 256] - - [251, 8992.36] + - [385, 8992.36] - - [1856, 3584, 1, 128] - - [232, 4957.27] + - [366, 4957.27] - - [128, 2944, 1, 128] - - [170, 2241.48] + - [304, 2241.48] - - [1024, 704, 1, 3328] - - [250, 6545.11] + - [384, 6545.11] - - [6784, 448, 1, 256] - - [246, 5379.25] + - [380, 5379.25] - - [3584, 6784, 1, 128] - - [232, 5102.01] + - [366, 5102.01] - - [128, 4288, 1, 256] - - [240, 5211.86] + - [374, 5211.86] - - [704, 448, 1, 3328] - - [241, 4504.15] + - [375, 4504.15] - - [1024, 1024, 1, 3328] - - [243, 8009.77] + - [377, 8009.77] - - [128, 128, 1, 3328] - - [226, 3185.03] + - [360, 3185.03] - - [5056, 1856, 1, 256] - - [240, 9138.43] + - [374, 9138.43] - - [256, 128, 1, 256] - - [212, 1205.36] + - [346, 1205.36] - - [1024, 1856, 1, 256] - - [251, 6375.09] + - [385, 6375.09] - - [4288, 64, 1, 128] - - [168, 1695.43] + - [302, 1695.43] - - [256, 448, 1, 3328] - - [215, 5659.67] + - [349, 5659.67] - - [1408, 6784, 1, 1280] - - [240, 9349.2] + - [374, 9349.2] - - [3584, 3584, 1, 1280] - - [245, 9302.19] + - [379, 9302.19] - - [64, 2368, 1, 1280] - - [214, 4433.07] + - [348, 4433.07] - - [448, 2368, 1, 1280] - - [240, 7250.77] + - [374, 7250.77] - - [5888, 5888, 1, 128] - - [232, 4616.03] + - [366, 4616.03] - - [64, 6784, 1, 3328] - - [246, 6987.23] + - [380, 6987.23] - - [2944, 256, 1, 1280] - - [249, 6127.45] + - [383, 6127.45] - - [5056, 5888, 1, 128] - - [231, 5106.39] + - [365, 5106.39] - - [256, 2368, 1, 128] - - [232, 2141.23] + - [366, 2141.23] - - [5056, 2368, 1, 3328] - - [243, 9041.75] + - [377, 9041.75] - - [2944, 4288, 1, 256] - - [251, 8691.22] + - [385, 8691.22] - - [1408, 3584, 1, 1280] - - [240, 9070.0] + - [374, 9070.0] - - [2368, 64, 1, 256] - - [212, 2412.87] + - [346, 2412.87] - - [64, 448, 1, 3328] - - [227, 3739.14] + - [361, 3739.14] - - [256, 256, 1, 3328] - - [214, 5304.18] + - [348, 5304.18] - - [5888, 4, 1, 128] - - [253, 105.655] + - [387, 105.655] - - [1856, 704, 1, 256] - - [240, 8025.43] + - [374, 8025.43] - - [4, 4288, 1, 1280] - - [187, 579.07] + - [321, 579.07] - - [1408, 448, 1, 3328] - - [248, 5714.51] + - [382, 5714.51] - - [1024, 4, 1, 3328] - - [206, 608.649] + - [340, 608.649] - - [2368, 256, 1, 256] - - [246, 5173.08] + - [380, 5173.08] - - [2368, 6784, 1, 3328] - - [246, 9456.61] + - [380, 9456.61] - - [1856, 1408, 1, 1280] - - [251, 7805.19] + - [385, 7805.19] - - [1856, 448, 1, 1280] - - [238, 6185.04] + - [372, 6185.04] - - [6784, 704, 1, 128] - - [232, 4597.87] + - [366, 4597.87] - - [4, 4, 1, 256] - - [209, 0.791892] + - [343, 0.791892] - - [128, 5888, 1, 128] - - [170, 2691.76] + - [304, 2691.76] - - [1408, 5888, 1, 256] - - [245, 7164.27] + - [379, 7164.27] - - [704, 2944, 1, 1280] - - [247, 8139.81] + - [381, 8139.81] - - [1856, 2368, 1, 128] - - [235, 4623.38] + - [369, 4623.38] - - [4096, 7133, 1, 4096] - - [239, 9940.07] + - [373, 9940.07] - - [256, 64, 1, 256] - - [203, 689.953] + - [337, 689.953] - - [1024, 1024, 1, 256] - - [246, 7216.11] + - [380, 7216.11] - - [704, 1856, 1, 256] - - [246, 6364.17] + - [380, 6364.17] - - [128, 4288, 1, 3328] - - [202, 7200.59] + - [336, 7200.59] - - [3584, 704, 1, 1280] - - [249, 7972.08] + - [383, 7972.08] - - [256, 128, 1, 1280] - - [200, 2702.62] + - [334, 2702.62] - - [2368, 4, 1, 256] - - [195, 326.018] + - [329, 326.018] - - [256, 2368, 1, 1280] - - [240, 6638.93] + - [374, 6638.93] - - [2944, 6784, 1, 128] - - [231, 5233.53] + - [365, 5233.53] - - [3584, 448, 1, 3328] - - [240, 8094.4] + - [374, 8094.4] - - [1408, 4, 1, 256] - - [258, 243.646] + - [392, 243.646] - - [704, 2368, 1, 3328] - - [240, 8403.11] + - [374, 8403.11] - - [2944, 448, 1, 256] - - [240, 7022.59] + - [374, 7022.59] - - [1856, 448, 1, 128] - - [235, 2842.79] + - [369, 2842.79] - - [2368, 128, 1, 1280] - - [222, 5685.52] + - [356, 5685.52] - - [256, 5888, 1, 128] - - [237, 2178.71] + - [371, 2178.71] - - [64, 6784, 1, 256] - - [240, 5385.23] + - [374, 5385.23] - - [64, 5056, 1, 1280] - - [214, 5603.29] + - [348, 5603.29] - - [4, 6784, 1, 128] - - [252, 180.256] + - [386, 180.256] - - [2944, 2944, 1, 1280] - - [249, 9129.39] + - [383, 9129.39] - - [5888, 2368, 1, 256] - - [251, 6961.69] + - [385, 6961.69] - - [4, 3584, 1, 1280] - - [195, 646.23] + - [329, 646.23] - - [1408, 128, 1, 128] - - [181, 1172.29] + - [315, 1172.29] - - [6784, 704, 1, 3328] - - [246, 9084.62] + - [380, 9084.62] - - [128, 64, 1, 1280] - - [225, 1260.41] + - [359, 1260.41] - - [2368, 256, 1, 1280] - - [246, 6643.48] + - [380, 6643.48] - - [4, 448, 1, 3328] - - [209, 433.514] + - [343, 433.514] - - [5888, 4288, 1, 128] - - [233, 4753.17] + - [367, 4753.17] - - [4, 5888, 1, 256] - - [195, 471.14] + - [329, 471.14] - - [1408, 2944, 1, 3328] - - [249, 9207.1] + - [383, 9207.1] - - [3584, 704, 1, 128] - - [235, 3762.46] + - [369, 3762.46] - - [64, 1024, 1, 256] - - [213, 1807.99] + - [347, 1807.99] - - [5056, 5056, 1, 128] - - [236, 4830.16] + - [370, 4830.16] - - [2368, 448, 1, 1280] - - [240, 7263.16] + - [374, 7263.16] - - [128, 3584, 1, 256] - - [243, 4369.17] + - [377, 4369.17] - - [704, 448, 1, 1280] - - [241, 4205.33] + - [375, 4205.33] - - [448, 5056, 1, 128] - - [232, 3855.57] + - [366, 3855.57] - - [256, 4, 1, 1280] - - [263, 157.638] + - [397, 157.638] - - [128, 5056, 1, 256] - - [246, 6109.06] + - [380, 6109.06] - - [1408, 5056, 1, 128] - - [235, 4836.68] + - [369, 4836.68] - - [2944, 3584, 1, 128] - - [235, 4532.19] + - [369, 4532.19] - - [3584, 2368, 1, 256] - - [240, 8951.34] + - [374, 8951.34] - - [5888, 5056, 1, 1280] - - [251, 9276.49] + - [385, 9276.49] - - [2368, 5056, 1, 128] - - [235, 5167.66] + - [369, 5167.66] - - [64, 704, 1, 256] - - [195, 1501.97] + - [329, 1501.97] - - [4288, 256, 1, 1280] - - [240, 7496.3] + - [374, 7496.3] - - [3584, 3584, 1, 3328] - - [241, 9301.77] + - [375, 9301.77] - - [1024, 256, 1, 128] - - [232, 1508.84] + - [366, 1508.84] - - [4, 704, 1, 128] - - [253, 12.1469] + - [387, 12.1469] - - [5888, 6784, 1, 256] - - [239, 9370.47] + - [373, 9370.47] - - [4288, 2944, 1, 3328] - - [243, 9149.09] + - [377, 9149.09] - - [2944, 64, 1, 128] - - [179, 1456.46] + - [313, 1456.46] - - [1856, 64, 1, 256] - - [205, 2210.03] + - [339, 2210.03] - - [4288, 128, 1, 3328] - - [199, 6471.95] + - [333, 6471.95] - - [4288, 704, 1, 1280] - - [246, 8934.61] + - [380, 8934.61] - - [256, 5056, 1, 1280] - - [240, 8439.13] + - [374, 8439.13] - - [1408, 256, 1, 128] - - [235, 1769.17] + - [369, 1769.17] - - [2944, 5888, 1, 3328] - - [240, 9448.04] + - [374, 9448.04] - - [6784, 5888, 1, 1280] - - [251, 9372.25] + - [385, 9372.25] - - [704, 128, 1, 256] - - [197, 2059.8] + - [331, 2059.8] - - [5888, 4288, 1, 1280] - - [243, 9244.32] + - [377, 9244.32] - - [448, 256, 1, 1280] - - [222, 4741.72] + - [356, 4741.72] - - [5888, 3584, 1, 128] - - [231, 4980.06] + - [365, 4980.06] - - [1856, 1856, 1, 128] - - [235, 4363.98] + - [369, 4363.98] - - [5056, 4, 1, 1280] - - [255, 629.641] + - [389, 629.641] - - [256, 1408, 1, 1280] - - [246, 5588.44] + - [380, 5588.44] - - [512, 16, 1, 512] - - [206, 689.953] + - [340, 689.953] - - [704, 3584, 1, 128] - - [235, 4069.67] + - [369, 4069.67] - - [5888, 448, 1, 3328] - - [251, 7925.94] + - [385, 7925.94] - - [2368, 4288, 1, 1280] - - [250, 8492.7] + - [384, 8492.7] - - [4288, 2944, 1, 128] - - [232, 5238.21] + - [366, 5238.21] - - [1024, 6784, 1, 3328] - - [246, 8578.18] + - [380, 8578.18] - - [128, 2368, 1, 256] - - [246, 3788.9] + - [380, 3788.9] - - [6784, 64, 1, 3328] - - [240, 7003.46] + - [374, 7003.46] - - [5056, 2944, 1, 3328] - - [243, 8575.45] + - [377, 8575.45] - - [448, 128, 1, 256] - - [195, 1715.06] + - [329, 1715.06] - - [2944, 3584, 1, 256] - - [240, 8994.26] + - [374, 8994.26] - - [1408, 1408, 1, 3328] - - [238, 8757.7] + - [372, 8757.7] - - [1856, 128, 1, 1280] - - [240, 5598.17] + - [374, 5598.17] - - [3584, 3584, 1, 128] - - [231, 4787.44] + - [365, 4787.44] - - [64, 3584, 1, 256] - - [246, 3546.01] + - [380, 3546.01] - - [1408, 4, 1, 3328] - - [190, 640.24] + - [324, 640.24] - - [128, 2944, 1, 3328] - - [214, 7204.24] + - [348, 7204.24] - - [3584, 704, 1, 256] - - [240, 6239.69] + - [374, 6239.69] - - [2944, 448, 1, 3328] - - [246, 7726.71] + - [380, 7726.71] - - [3584, 1408, 1, 3328] - - [238, 9358.78] + - [372, 9358.78] - - [704, 3584, 1, 1280] - - [246, 8005.28] + - [380, 8005.28] - - [2944, 6784, 1, 1280] - - [238, 9487.73] + - [372, 9487.73] - - [1856, 6784, 1, 256] - - [240, 5684.56] + - [374, 5684.56] - - [4288, 448, 1, 3328] - - [246, 8410.38] + - [380, 8410.38] - - [6784, 4288, 1, 128] - - [236, 4785.58] + - [370, 4785.58] - - [6784, 704, 1, 1280] - - [240, 5579.05] + - [374, 5579.05] - - [256, 4288, 1, 256] - - [240, 6781.43] + - [374, 6781.43] - - [3584, 64, 1, 128] - - [179, 1474.0] + - [313, 1474.0] - - [5888, 1024, 1, 3328] - - [238, 8639.49] + - [372, 8639.49] - - [448, 64, 1, 128] - - [170, 259.282] + - [304, 259.282] - - [704, 6784, 1, 1280] - - [246, 9027.25] + - [380, 9027.25] - - [5888, 128, 1, 256] - - [246, 6812.88] + - [380, 6812.88] - - [2368, 448, 1, 3328] - - [246, 7356.63] + - [380, 7356.63] - - [1856, 5056, 1, 3328] - - [245, 8871.56] + - [379, 8871.56] - - [4, 6784, 1, 256] - - [254, 469.479] + - [388, 469.479] - - [1024, 3584, 1, 128] - - [232, 3428.02] + - [366, 3428.02] - - [1024, 1408, 1, 128] - - [235, 2935.05] + - [369, 2935.05] - - [2368, 2944, 1, 128] - - [235, 4888.02] + - [369, 4888.02] - - [5056, 64, 1, 256] - - [204, 3186.16] + - [338, 3186.16] - - [4, 448, 1, 1280] - - [209, 273.167] + - [343, 273.167] - - [5056, 2944, 1, 128] - - [236, 4752.79] + - [370, 4752.79] - - [5888, 5056, 1, 3328] - - [250, 9124.77] + - [384, 9124.77] - - [1024, 704, 1, 128] - - [235, 2302.36] + - [369, 2302.36] - - [1408, 2368, 1, 128] - - [235, 3826.95] + - [369, 3826.95] - - [5888, 2368, 1, 128] - - [232, 4912.77] + - [366, 4912.77] - - [128, 5056, 1, 3328] - - [222, 7583.8] + - [356, 7583.8] - - [3584, 6784, 1, 1280] - - [249, 9313.5] + - [383, 9313.5] - - [3072, 7435, 1, 1024] - - [243, 9322.07] + - [377, 9322.07] - - [1856, 5888, 1, 256] - - [240, 5778.34] + - [374, 5778.34] - - [256, 256, 1, 256] - - [192, 1576.91] + - [326, 1576.91] - - [256, 64, 1, 128] - - [178, 173.705] + - [312, 173.705] - - [4288, 4288, 1, 3328] - - [245, 8416.27] + - [379, 8416.27] - - [4288, 1408, 1, 1280] - - [251, 9301.97] + - [385, 9301.97] - - [3584, 5056, 1, 128] - - [237, 4344.94] + - [371, 4344.94] - - [4, 1024, 1, 3328] - - [206, 615.239] + - [340, 615.239] - - [4288, 2368, 1, 256] - - [240, 9142.67] + - [374, 9142.67] - - [2944, 5056, 1, 1280] - - [240, 9399.69] + - [374, 9399.69] - - [448, 6784, 1, 256] - - [239, 5710.93] + - [373, 5710.93] - - [64, 1024, 1, 3328] - - [222, 4975.1] + - [356, 4975.1] - - [6784, 2368, 1, 3328] - - [249, 9207.63] + - [383, 9207.63] - - [256, 1024, 1, 1280] - - [246, 5983.42] + - [380, 5983.42] - - [704, 4, 1, 128] - - [252, 15.1187] + - [386, 15.1187] - - [256, 4, 1, 256] - - [209, 52.9516] + - [343, 52.9516] - - [4288, 128, 1, 256] - - [240, 5242.98] + - [374, 5242.98] - - [4288, 1856, 1, 3328] - - [251, 9354.06] + - [385, 9354.06] - - [3584, 448, 1, 128] - - [232, 3353.9] + - [366, 3353.9] - - [256, 4, 1, 3328] - - [263, 313.324] + - [397, 313.324] - - [4, 1408, 1, 1280] - - [206, 509.207] + - [340, 509.207] - - [3584, 64, 1, 1280] - - [194, 5198.42] + - [328, 5198.42] - - [1408, 448, 1, 128] - - [232, 2628.37] + - [366, 2628.37] - - [3584, 1024, 1, 1280] - - [246, 8535.01] + - [380, 8535.01] - - [1856, 5056, 1, 256] - - [238, 8184.49] + - [372, 8184.49] - - [4, 3584, 1, 256] - - [256, 395.576] + - [390, 395.576] - - [1024, 4288, 1, 256] - - [241, 5966.52] + - [375, 5966.52] - - [5888, 3584, 1, 3328] - - [244, 9189.43] + - [378, 9189.43] - - [4, 256, 1, 256] - - [260, 41.5785] + - [394, 41.5785] - - [5056, 3584, 1, 3328] - - [245, 9431.92] + - [379, 9431.92] - - [128, 5888, 1, 1280] - - [240, 8192.1] + - [374, 8192.1] - - [704, 448, 1, 128] - - [232, 1510.96] + - [366, 1510.96] - - [2368, 1408, 1, 1280] - - [240, 8415.65] + - [374, 8415.65] - - [5056, 2944, 1, 1280] - - [251, 9294.77] + - [385, 9294.77] - - [4, 4, 1, 128] - - [253, 0.1356549] + - [387, 0.1356549] - - [3584, 256, 1, 256] - - [240, 6749.55] + - [374, 6749.55] - - [128, 1856, 1, 3328] - - [193, 6797.09] + - [327, 6797.09] - - [1024, 6784, 1, 256] - - [246, 8783.09] + - [380, 8783.09] - - [4, 128, 1, 256] - - [206, 27.4067] + - [340, 27.4067] - - [64, 64, 1, 1280] - - [225, 712.448] + - [359, 712.448] - - [6784, 4, 1, 128] - - [253, 122.06] + - [387, 122.06] - - [2944, 1408, 1, 128] - - [235, 4430.46] + - [369, 4430.46] - - [448, 128, 1, 3328] - - [222, 5097.34] + - [356, 5097.34] - - [64, 2944, 1, 3328] - - [222, 6362.2] + - [356, 6362.2] - - [64, 4288, 1, 3328] - - [222, 6565.01] + - [356, 6565.01] - - [5056, 6784, 1, 3328] - - [246, 8121.18] + - [380, 8121.18] - - [128, 2944, 1, 256] - - [240, 4692.17] + - [374, 4692.17] - - [128, 6784, 1, 128] - - [169, 2687.46] + - [303, 2687.46] - - [3584, 4288, 1, 256] - - [246, 9193.99] + - [380, 9193.99] - - [448, 1856, 1, 256] - - [246, 6231.39] + - [380, 6231.39] - - [1856, 6784, 1, 3328] - - [251, 9191.48] + - [385, 9191.48] - - [3584, 128, 1, 3328] - - [240, 7368.47] + - [374, 7368.47] - - [64, 1856, 1, 256] - - [191, 2184.63] + - [325, 2184.63] - - [1024, 448, 1, 1280] - - [246, 6977.32] + - [380, 6977.32] - - [5888, 4288, 1, 256] - - [246, 5780.5] + - [380, 5780.5] - - [4, 448, 1, 128] - - [253, 9.06] + - [387, 9.06] - - [5056, 1408, 1, 256] - - [240, 5601.35] + - [374, 5601.35] - - [64, 256, 1, 1280] - - [206, 1927.63] + - [340, 1927.63] - - [3584, 1024, 1, 256] - - [251, 7542.84] + - [385, 7542.84] - - [256, 704, 1, 256] - - [240, 2957.62] + - [374, 2957.62] - - [5888, 5888, 1, 256] - - [251, 7344.14] + - [385, 7344.14] - - [4288, 1024, 1, 1280] - - [246, 8925.84] + - [380, 8925.84] - - [5888, 128, 1, 3328] - - [240, 8410.07] + - [374, 8410.07] - - [448, 6784, 1, 3328] - - [240, 8862.56] + - [374, 8862.56] - - [2944, 1408, 1, 1280] - - [251, 7478.93] + - [385, 7478.93] - - [1024, 32, 1, 512] - - [195, 1777.35] + - [329, 1777.35] - - [2944, 1856, 1, 3328] - - [240, 9153.43] + - [374, 9153.43] - - [2368, 64, 1, 128] - - [179, 1102.3] + - [313, 1102.3] - - [2944, 2944, 1, 128] - - [231, 4591.95] + - [365, 4591.95] - - [4, 128, 1, 3328] - - [261, 119.09] + - [395, 119.09] - - [3584, 5888, 1, 1280] - - [240, 9222.49] + - [374, 9222.49] - - [64, 4, 1, 128] - - [252, 1.03516] + - [386, 1.03516] - - [6784, 1856, 1, 1280] - - [240, 9136.07] + - [374, 9136.07] - - [2944, 5056, 1, 256] - - [246, 8860.13] + - [380, 8860.13] - - [2944, 5888, 1, 1280] - - [239, 9643.63] + - [373, 9643.63] - - [5888, 256, 1, 3328] - - [246, 8799.53] + - [380, 8799.53] - - [1856, 5888, 1, 3328] - - [246, 9457.53] + - [380, 9457.53] - - [3584, 1408, 1, 256] - - [246, 8672.53] + - [380, 8672.53] - - [704, 3584, 1, 3328] - - [246, 8525.3] + - [380, 8525.3] - - [5056, 448, 1, 1280] - - [246, 8843.77] + - [380, 8843.77] - - [3584, 1856, 1, 3328] - - [238, 8881.53] + - [372, 8881.53] - - [64, 1408, 1, 128] - - [167, 747.142] + - [301, 747.142] - - [1408, 704, 1, 1280] - - [240, 8342.93] + - [374, 8342.93] - - [2944, 1024, 1, 256] - - [251, 8079.58] + - [385, 8079.58] - - [1024, 2368, 1, 128] - - [235, 3347.58] + - [369, 3347.58] - - [2368, 4288, 1, 3328] - - [246, 9467.67] + - [380, 9467.67] - - [4, 1408, 1, 256] - - [258, 257.563] + - [392, 257.563] - - [1024, 1408, 1, 1280] - - [246, 8241.84] + - [380, 8241.84] - - [64, 64, 1, 256] - - [206, 190.059] + - [340, 190.059] - - [704, 256, 1, 3328] - - [240, 4519.28] + - [374, 4519.28] - - [6784, 5056, 1, 256] - - [239, 9133.78] + - [373, 9133.78] - - [4, 4288, 1, 3328] - - [190, 670.075] + - [324, 670.075] - - [448, 6784, 1, 128] - - [232, 4481.92] + - [366, 4481.92] - - [4, 704, 1, 3328] - - [262, 523.071] + - [396, 523.071] - - [448, 2944, 1, 256] - - [240, 7022.59] + - [374, 7022.59] - - [2944, 6784, 1, 256] - - [246, 9199.84] + - [380, 9199.84] - - [2368, 2368, 1, 1280] - - [251, 8646.84] + - [385, 8646.84] - - [4, 4, 1, 1280] - - [209, 3.11176] + - [343, 3.11176] - - [1856, 3584, 1, 1280] - - [238, 8805.45] + - [372, 8805.45] - - [64, 2944, 1, 256] - - [212, 2565.76] + - [346, 2565.76] - - [3584, 1408, 1, 1280] - - [251, 9273.12] + - [385, 9273.12] - - [448, 256, 1, 128] - - [167, 941.13] + - [301, 941.13] - - [4288, 448, 1, 128] - - [233, 3215.2] + - [367, 3215.2] - - [5056, 256, 1, 1280] - - [246, 8790.13] + - [380, 8790.13] - - [1856, 1408, 1, 3328] - - [240, 9310.73] + - [374, 9310.73] - - [128, 128, 1, 128] - - [175, 155.215] + - [309, 155.215] - - [1024, 4288, 1, 3328] - - [243, 8528.12] + - [377, 8528.12] - - [448, 2368, 1, 256] - - [247, 5097.34] + - [381, 5097.34] - - [1024, 4, 1, 128] - - [253, 10.3721] + - [387, 10.3721] - - [5056, 448, 1, 256] - - [246, 8236.78] + - [380, 8236.78] - - [2944, 2368, 1, 3328] - - [239, 9331.16] + - [373, 9331.16] - - [704, 128, 1, 3328] - - [214, 5969.3] + - [348, 5969.3] - - [64, 64, 1, 3328] - - [230, 1494.78] + - [364, 1494.78] - - [1024, 1856, 1, 1280] - - [245, 6356.43] + - [379, 6356.43] - - [6784, 1856, 1, 256] - - [246, 9068.63] + - [380, 9068.63] - - [128, 2368, 1, 3328] - - [222, 6714.22] + - [356, 6714.22] - - [1024, 5888, 1, 256] - - [246, 5501.6] + - [380, 5501.6] - - [5056, 128, 1, 1280] - - [202, 6455.64] + - [336, 6455.64] - - [5056, 64, 1, 3328] - - [207, 6703.81] + - [341, 6703.81] - - [128, 704, 1, 128] - - [168, 696.618] + - [302, 696.618] - - [1408, 2368, 1, 256] - - [240, 8667.25] + - [374, 8667.25] - - [1408, 1408, 1, 256] - - [251, 7615.81] + - [385, 7615.81] - - [4, 64, 1, 128] - - [253, 1.08463] + - [387, 1.08463] - - [64, 128, 1, 1280] - - [225, 1379.81] + - [359, 1379.81] - - [2368, 2368, 1, 128] - - [235, 4582.26] + - [369, 4582.26] - - [64, 5888, 1, 128] - - [168, 2086.37] + - [302, 2086.37] - - [5888, 4, 1, 3328] - - [189, 667.514] + - [323, 667.514] - - [6784, 1408, 1, 128] - - [236, 4516.34] + - [370, 4516.34] - - [4288, 5888, 1, 256] - - [251, 8497.43] + - [385, 8497.43] - - [1408, 5056, 1, 256] - - [240, 8867.46] + - [374, 8867.46] - - [5056, 128, 1, 3328] - - [222, 7678.98] + - [356, 7678.98] - - [128, 128, 1, 1280] - - [210, 2016.59] + - [344, 2016.59] - - [448, 704, 1, 256] - - [241, 3030.89] + - [375, 3030.89] - - [4288, 3584, 1, 128] - - [232, 5246.33] + - [366, 5246.33] - - [2944, 128, 1, 3328] - - [207, 6795.16] + - [341, 6795.16] - - [128, 5056, 1, 1280] - - [193, 6193.09] + - [327, 6193.09] - - [3584, 5056, 1, 1280] - - [245, 9499.17] + - [379, 9499.17] - - [256, 448, 1, 1280] - - [201, 4267.56] + - [335, 4267.56] - - [704, 704, 1, 128] - - [235, 2259.32] + - [369, 2259.32] - - [5056, 4, 1, 128] - - [253, 12.5313] + - [387, 12.5313] - - [704, 256, 1, 1280] - - [240, 4355.97] + - [374, 4355.97] - - [64, 2368, 1, 3328] - - [214, 6310.97] + - [348, 6310.97] - - [1856, 1024, 1, 128] - - [231, 4065.43] + - [365, 4065.43] - - [1856, 64, 1, 128] - - [170, 936.329] + - [304, 936.329] - - [64, 6784, 1, 1280] - - [193, 5731.8] + - [327, 5731.8] - - [704, 4288, 1, 256] - - [246, 5218.9] + - [380, 5218.9] - - [5888, 2368, 1, 1280] - - [240, 9378.9] + - [374, 9378.9] - - [128, 256, 1, 256] - - [210, 1219.37] + - [344, 1219.37] - - [256, 64, 1, 1280] - - [212, 1820.54] + - [346, 1820.54] - - [2368, 5888, 1, 1280] - - [251, 9143.64] + - [385, 9143.64] - - [5888, 256, 1, 1280] - - [240, 8678.47] + - [374, 8678.47] - - [4, 5888, 1, 1280] - - [187, 668.242] + - [321, 668.242] - - [704, 128, 1, 128] - - [175, 649.556] + - [309, 649.556] - - [1024, 4, 1, 1280] - - [206, 478.465] + - [340, 478.465] - - [2368, 1856, 1, 3328] - - [238, 8153.87] + - [372, 8153.87] - - [2368, 128, 1, 128] - - [173, 1858.21] + - [307, 1858.21] - - [2944, 704, 1, 256] - - [240, 8438.07] + - [374, 8438.07] - - [5056, 128, 1, 128] - - [169, 2689.63] + - [303, 2689.63] - - [256, 704, 1, 3328] - - [240, 4541.18] + - [374, 4541.18] - - [704, 3584, 1, 256] - - [241, 7771.07] + - [375, 7771.07] - - [1024, 1024, 1, 1024] - - [246, 8305.62] + - [380, 8305.62] - - [704, 2944, 1, 3328] - - [246, 9166.48] + - [380, 9166.48] - - [6784, 1024, 1, 128] - - [231, 4362.31] + - [365, 4362.31] - - [256, 448, 1, 128] - - [178, 899.614] + - [312, 899.614] - - [448, 1024, 1, 3328] - - [240, 7385.56] + - [374, 7385.56] - - [2944, 1024, 1, 3328] - - [243, 8779.81] + - [377, 8779.81] - - [2944, 5056, 1, 128] - - [235, 5103.11] + - [369, 5103.11] - - [1408, 6784, 1, 256] - - [246, 8346.89] + - [380, 8346.89] - - [6784, 1408, 1, 3328] - - [242, 8878.4] + - [376, 8878.4] - - [4288, 6784, 1, 128] - - [231, 5432.99] + - [365, 5432.99] - - [704, 64, 1, 256] - - [220, 1441.89] + - [354, 1441.89] - - [5888, 4, 1, 1280] - - [257, 636.641] + - [391, 636.641] - - [256, 2368, 1, 3328] - - [240, 6804.8] + - [374, 6804.8] - - [6784, 2944, 1, 1280] - - [239, 9472.26] + - [373, 9472.26] - - [4288, 1856, 1, 128] - - [235, 4886.38] + - [369, 4886.38] - - [1856, 2944, 1, 128] - - [232, 4642.96] + - [366, 4642.96] - - [6784, 448, 1, 128] - - [232, 4369.17] + - [366, 4369.17] - - [64, 3584, 1, 128] - - [179, 1645.85] + - [313, 1645.85] - - [448, 5056, 1, 1280] - - [240, 8553.64] + - [374, 8553.64] - - [2368, 1856, 1, 128] - - [232, 4741.85] + - [366, 4741.85] - - [128, 448, 1, 1280] - - [222, 3745.01] + - [356, 3745.01] - - [4288, 704, 1, 256] - - [240, 8444.16] + - [374, 8444.16] - - [256, 3584, 1, 128] - - [232, 2454.96] + - [366, 2454.96] - - [5888, 704, 1, 256] - - [240, 8819.57] + - [374, 8819.57] - - [3584, 1024, 1, 128] - - [235, 4094.96] + - [369, 4094.96] - - [256, 5888, 1, 3328] - - [249, 8538.33] + - [383, 8538.33] - - [1408, 4288, 1, 3328] - - [251, 9212.57] + - [385, 9212.57] - - [6784, 4288, 1, 256] - - [239, 9163.12] + - [373, 9163.12] - - [4288, 256, 1, 128] - - [232, 3081.44] + - [366, 3081.44] - - [5888, 256, 1, 256] - - [240, 7680.75] + - [374, 7680.75] - - [6784, 1024, 1, 1280] - - [251, 9248.63] + - [385, 9248.63] - - [5888, 1024, 1, 128] - - [235, 4061.94] + - [369, 4061.94] - - [1024, 128, 1, 256] - - [246, 2317.39] + - [380, 2317.39] - - [128, 64, 1, 3328] - - [229, 2116.79] + - [363, 2116.79] - - [448, 64, 1, 256] - - [212, 1079.52] + - [346, 1079.52] - - [2368, 256, 1, 128] - - [233, 2229.83] + - [367, 2229.83] - - [6784, 3584, 1, 1280] - - [246, 9096.6] + - [380, 9096.6] - - [1024, 6784, 1, 1280] - - [244, 9112.9] + - [378, 9112.9] - - [2944, 64, 1, 1280] - - [202, 4983.0] + - [336, 4983.0] - - [1408, 2944, 1, 1280] - - [241, 9131.63] + - [375, 9131.63] - - [256, 1856, 1, 256] - - [249, 4432.86] + - [383, 4432.86] - - [1408, 2368, 1, 3328] - - [249, 8449.18] + - [383, 8449.18] - - [2944, 4, 1, 3328] - - [195, 673.94] + - [329, 673.94] - - [128, 1408, 1, 3328] - - [214, 6582.47] + - [348, 6582.47] - - [2944, 1856, 1, 128] - - [232, 4827.54] + - [366, 4827.54] - - [256, 2944, 1, 128] - - [235, 2416.66] + - [369, 2416.66] - - [256, 6784, 1, 128] - - [235, 3118.76] + - [369, 3118.76] - - [2368, 4, 1, 128] - - [253, 22.7197] + - [387, 22.7197] - - [1408, 256, 1, 3328] - - [240, 3733.82] + - [374, 3733.82] - - [1856, 4, 1, 128] - - [252, 7.20009] + - [386, 7.20009] - - [1024, 16, 1, 512] - - [208, 1165.18] + - [342, 1165.18] - - [5056, 6784, 1, 128] - - [236, 4949.13] + - [370, 4949.13] - - [4288, 5056, 1, 128] - - [235, 4966.9] + - [369, 4966.9] - - [1856, 5888, 1, 128] - - [231, 4351.76] + - [365, 4351.76] - - [2944, 5888, 1, 256] - - [251, 8460.99] + - [385, 8460.99] - - [3584, 1856, 1, 256] - - [246, 8876.7] + - [380, 8876.7] - - [4288, 3584, 1, 1280] - - [239, 9603.7] + - [373, 9603.7] - - [2368, 448, 1, 256] - - [240, 6604.7] + - [374, 6604.7] - - [4288, 256, 1, 3328] - - [240, 7619.89] + - [374, 7619.89] - - [1856, 704, 1, 128] - - [232, 3629.61] + - [366, 3629.61] - - [1408, 64, 1, 256] - - [196, 2168.21] + - [330, 2168.21] - - [64, 1856, 1, 128] - - [172, 979.762] + - [306, 979.762] - - [4, 256, 1, 128] - - [253, 5.23595] + - [387, 5.23595] - - [704, 4288, 1, 3328] - - [246, 9014.52] + - [380, 9014.52] - - [704, 5888, 1, 128] - - [233, 4221.77] + - [367, 4221.77] - - [6784, 3584, 1, 128] - - [231, 5360.73] + - [365, 5360.73] - - [1024, 64, 1, 256] - - [191, 1588.85] + - [325, 1588.85] - - [64, 2368, 1, 256] - - [246, 2552.55] + - [380, 2552.55] - - [4288, 5056, 1, 3328] - - [245, 8193.38] + - [379, 8193.38] - - [4, 1856, 1, 1280] - - [195, 499.192] + - [329, 499.192] - - [4288, 128, 1, 128] - - [232, 2373.57] + - [366, 2373.57] - - [1408, 1408, 1, 128] - - [235, 3753.88] + - [369, 3753.88] - - [1024, 128, 1, 3328] - - [217, 5656.32] + - [351, 5656.32] - - [1856, 128, 1, 128] - - [168, 1617.58] + - [302, 1617.58] - - [5056, 2368, 1, 256] - - [251, 5553.41] + - [385, 5553.41] - - [4288, 704, 1, 3328] - - [239, 6962.06] + - [373, 6962.06] - - [448, 3584, 1, 256] - - [249, 5981.5] + - [383, 5981.5] - - [64, 128, 1, 128] - - [186, 74.9983] + - [320, 74.9983] - - [2368, 64, 1, 1280] - - [222, 5041.33] + - [356, 5041.33] - - [2368, 1024, 1, 1280] - - [247, 7740.97] + - [381, 7740.97] - - [2944, 1408, 1, 3328] - - [249, 9204.65] + - [383, 9204.65] - - [1408, 448, 1, 256] - - [246, 5954.4] + - [380, 5954.4] - - [1024, 1408, 1, 3328] - - [243, 8161.54] + - [377, 8161.54] - - [2560, 7133, 1, 2560] - - [238, 9636.69] + - [372, 9636.69] - - [1408, 4, 1, 1280] - - [190, 520.979] + - [324, 520.979] - - [5888, 3584, 1, 256] - - [251, 9225.26] + - [385, 9225.26] - - [128, 1024, 1, 1280] - - [193, 4755.55] + - [327, 4755.55] - - [1408, 1856, 1, 3328] - - [243, 9130.87] + - [377, 9130.87] - - [4, 4, 1, 3328] - - [263, 7.03333] + - [397, 7.03333] - - [6784, 1408, 1, 1280] - - [240, 9346.91] + - [374, 9346.91] - - [4, 1024, 1, 1280] - - [190, 422.913] + - [324, 422.913] - - [704, 2944, 1, 256] - - [246, 8332.06] + - [380, 8332.06] - - [704, 4288, 1, 128] - - [232, 4371.14] + - [366, 4371.14] - - [2368, 4288, 1, 128] - - [232, 3988.89] + - [366, 3988.89] - - [64, 4288, 1, 1280] - - [222, 5407.63] + - [356, 5407.63] - - [6784, 64, 1, 1280] - - [202, 5708.25] + - [336, 5708.25] - - [3584, 128, 1, 128] - - [168, 2463.2] + - [302, 2463.2] - - [1024, 6784, 1, 128] - - [233, 3862.12] + - [367, 3862.12] - - [4, 1856, 1, 128] - - [253, 30.6362] + - [387, 30.6362] - - [1408, 64, 1, 3328] - - [222, 6095.48] + - [356, 6095.48] - - [6784, 4, 1, 256] - - [255, 487.938] + - [389, 487.938] - - [1408, 1408, 1, 1280] - - [251, 8640.63] + - [385, 8640.63] - - [256, 2368, 1, 256] - - [243, 4282.36] + - [377, 4282.36] - - [448, 4288, 1, 3328] - - [240, 8516.13] + - [374, 8516.13] - - [2368, 1408, 1, 256] - - [246, 8632.19] + - [380, 8632.19] - - [5888, 5056, 1, 128] - - [232, 5091.11] + - [366, 5091.11] - - [704, 2368, 1, 256] - - [246, 7664.8] + - [380, 7664.8] - - [2944, 448, 1, 1280] - - [246, 7618.35] + - [380, 7618.35] - - [5888, 2368, 1, 3328] - - [249, 9343.48] + - [383, 9343.48] - - [64, 2944, 1, 1280] - - [214, 5162.18] + - [348, 5162.18] - - [448, 1856, 1, 1280] - - [240, 7028.0] + - [374, 7028.0] - - [4288, 448, 1, 1280] - - [240, 5855.76] + - [374, 5855.76] - - [5888, 704, 1, 3328] - - [249, 9190.91] + - [383, 9190.91] - - [5056, 256, 1, 128] - - [235, 3235.94] + - [369, 3235.94] - - [1856, 256, 1, 128] - - [233, 1849.78] + - [367, 1849.78] - - [5056, 128, 1, 256] - - [246, 6109.06] + - [380, 6109.06] - - [704, 4, 1, 256] - - [206, 125.256] + - [340, 125.256] - - [1408, 5888, 1, 128] - - [232, 5055.16] + - [366, 5055.16] - - [4288, 4, 1, 128] - - [252, 95.7209] + - [386, 95.7209] - - [1408, 1024, 1, 256] - - [240, 7370.28] + - [374, 7370.28] - - [1024, 1856, 1, 128] - - [232, 2966.8] + - [366, 2966.8] - - [256, 704, 1, 128] - - [234, 528.229] + - [368, 528.229] - - [256, 1024, 1, 128] - - [232, 1171.69] + - [366, 1171.69] - - [448, 1024, 1, 256] - - [246, 5624.65] + - [380, 5624.65] - - [128, 4, 1, 3328] - - [263, 191.985] + - [397, 191.985] - - [5056, 6784, 1, 1280] - - [240, 9544.07] + - [374, 9544.07] - - [704, 5056, 1, 3328] - - [247, 8790.35] + - [381, 8790.35] - - [64, 1408, 1, 1280] - - [214, 4505.7] + - [348, 4505.7] - - [3584, 5056, 1, 3328] - - [245, 9073.52] + - [379, 9073.52] - - [1856, 4, 1, 3328] - - [263, 612.875] + - [397, 612.875] - - [4, 2944, 1, 128] - - [252, 72.0145] + - [386, 72.0145] - - [2368, 2944, 1, 3328] - - [238, 9314.68] + - [372, 9314.68] - - [448, 448, 1, 1280] - - [222, 5129.91] + - [356, 5129.91] - - [2368, 3584, 1, 256] - - [240, 8998.8] - - - [1024, 256, 1, 1280] - - [247, 3566.68] + - [374, 8998.8] - - [5056, 3584, 1, 1280] - - [241, 9345.17] + - [375, 9345.17] - - [448, 4, 1, 3328] - - [263, 487.337] + - [397, 487.337] - - [1856, 2944, 1, 1280] - - [251, 8438.79] + - [385, 8438.79] - - [3584, 2368, 1, 1280] - - [246, 9298.9] + - [380, 9298.9] - - [128, 1024, 1, 256] - - [198, 2356.45] + - [332, 2356.45] - - [2944, 1408, 1, 256] - - [238, 5440.82] + - [372, 5440.82] - - [4288, 1408, 1, 3328] - - [238, 9386.09] + - [372, 9386.09] - - [3584, 64, 1, 3328] - - [194, 6310.97] + - [328, 6310.97] - - [1408, 128, 1, 256] - - [240, 2942.53] + - [374, 2942.53] - - [2944, 1024, 1, 128] - - [235, 3927.99] + - [369, 3927.99] - - [4288, 5056, 1, 1280] - - [242, 8328.58] + - [376, 8328.58] - - [5888, 6784, 1, 1280] - - [251, 9757.44] + - [385, 9757.44] - - [6784, 5056, 1, 128] - - [231, 5101.4] + - [365, 5101.4] - - [256, 1024, 1, 3328] - - [240, 6475.87] + - [374, 6475.87] - - [3584, 4, 1, 256] - - [256, 420.973] + - [390, 420.973] - - [1856, 64, 1, 3328] - - [222, 6409.2] + - [356, 6409.2] - - [64, 6784, 1, 128] - - [170, 2387.32] + - [304, 2387.32] - - [5888, 1408, 1, 3328] - - [245, 9655.89] + - [379, 9655.89] - - [5888, 64, 1, 1280] - - [240, 5870.86] + - [374, 5870.86] - - [256, 5056, 1, 256] - - [243, 6109.06] + - [377, 6109.06] - - [128, 3584, 1, 128] - - [173, 2383.23] + - [307, 2383.23] - - [448, 3584, 1, 3328] - - [238, 7092.28] + - [372, 7092.28] - - [704, 2368, 1, 128] - - [232, 3741.08] + - [366, 3741.08] - - [5888, 256, 1, 128] - - [233, 2977.54] + - [367, 2977.54] - - [4, 5056, 1, 128] - - [252, 132.72] + - [386, 132.72] - - [448, 256, 1, 256] - - [204, 2308.29] + - [338, 2308.29] - - [704, 4, 1, 3328] - - [209, 552.674] + - [343, 552.674] - - [1408, 256, 1, 256] - - [240, 4577.22] + - [374, 4577.22] - - [3584, 1856, 1, 128] - - [232, 4571.86] + - [366, 4571.86] - - [4288, 4288, 1, 128] - - [235, 5284.65] + - [369, 5284.65] - - [1856, 1024, 1, 3328] - - [246, 6362.25] + - [380, 6362.25] - - [128, 5888, 1, 3328] - - [216, 7040.83] + - [350, 7040.83] - - [1024, 5056, 1, 256] - - [251, 7855.7] + - [385, 7855.7] - - [2368, 1408, 1, 3328] - - [246, 9205.66] + - [380, 9205.66] - - [5888, 448, 1, 256] - - [243, 5538.84] + - [377, 5538.84] - - [5888, 6784, 1, 128] - - [231, 4500.85] + - [365, 4500.85] - - [2368, 4, 1, 3328] - - [209, 642.898] + - [343, 642.898] - - [6784, 5056, 1, 1280] - - [247, 9249.23] + - [381, 9249.23] - - [5056, 704, 1, 1280] - - [246, 8883.37] + - [380, 8883.37] - - [1408, 256, 1, 1280] - - [240, 5632.1] + - [374, 5632.1] - - [4288, 6784, 1, 1280] - - [246, 8843.31] + - [380, 8843.31] - - [128, 704, 1, 256] - - [204, 2045.19] + - [338, 2045.19] - - [448, 128, 1, 1280] - - [214, 3807.17] + - [348, 3807.17] - - [6784, 4, 1, 3328] - - [257, 684.671] + - [391, 684.671] - - [4288, 4, 1, 1280] - - [206, 601.925] + - [340, 601.925] - - [1024, 64, 1, 3328] - - [218, 3928.48] + - [352, 3928.48] - - [1856, 4, 1, 256] - - [256, 293.394] + - [390, 293.394] - - [64, 3584, 1, 1280] - - [240, 5265.55] + - [374, 5265.55] - - [6784, 1408, 1, 256] - - [240, 9059.36] + - [374, 9059.36] - - [3584, 5888, 1, 128] - - [232, 5084.29] + - [366, 5084.29] - - [5056, 5888, 1, 256] - - [251, 8590.09] + - [385, 8590.09] - - [2368, 1024, 1, 256] - - [243, 4493.13] + - [377, 4493.13] - - [2944, 1856, 1, 256] - - [249, 5202.41] + - [383, 5202.41] - - [1856, 6784, 1, 1280] - - [247, 9071.48] + - [381, 9071.48] - - [64, 5056, 1, 128] - - [170, 2038.42] + - [304, 2038.42] - - [5888, 64, 1, 128] - - [169, 2016.59] + - [303, 2016.59] - - [448, 704, 1, 128] - - [233, 1173.65] + - [367, 1173.65] - - [4, 1024, 1, 128] - - [252, 8.89685] + - [386, 8.89685] - - [4288, 3584, 1, 256] - - [246, 9080.26] + - [380, 9080.26] - - [1408, 704, 1, 128] - - [232, 3165.71] + - [366, 3165.71] - - [64, 256, 1, 3328] - - [226, 3126.59] + - [360, 3126.59] - - [5056, 1856, 1, 1280] - - [243, 8857.55] + - [377, 8857.55] - - [1408, 1024, 1, 3328] - - [249, 8177.12] + - [383, 8177.12] - - [2368, 256, 1, 3328] - - [240, 6810.31] + - [374, 6810.31] - - [5888, 3584, 1, 1280] - - [238, 9535.55] + - [372, 9535.55] - - [1856, 3584, 1, 3328] - - [240, 9281.91] + - [374, 9281.91] - - [5888, 128, 1, 1280] - - [246, 8136.82] + - [380, 8136.82] - - [1024, 2944, 1, 256] - - [238, 7247.96] + - [372, 7247.96] - - [448, 6784, 1, 1280] - - [246, 7014.04] + - [380, 7014.04] - - [256, 3584, 1, 1280] - - [240, 7738.64] + - [374, 7738.64] - - [448, 128, 1, 128] - - [170, 496.048] + - [304, 496.048] - - [704, 5056, 1, 256] - - [246, 8609.44] + - [380, 8609.44] - - [3584, 1024, 1, 3328] - - [239, 7765.73] + - [373, 7765.73] - - [2944, 1856, 1, 1280] - - [251, 7776.03] + - [385, 7776.03] - - [128, 256, 1, 128] - - [183, 296.308] + - [317, 296.308] - - [5056, 256, 1, 256] - - [240, 7829.73] + - [374, 7829.73] - - [2368, 3584, 1, 3328] - - [239, 8896.08] + - [373, 8896.08] - - [2944, 704, 1, 1280] - - [249, 6855.83] + - [383, 6855.83] - - [128, 4, 1, 256] - - [258, 24.9242] + - [392, 24.9242] - - [2944, 3584, 1, 1280] - - [251, 9049.22] + - [385, 9049.22] - - [1856, 5888, 1, 1280] - - [246, 9432.06] + - [380, 9432.06] - - [256, 256, 1, 1280] - - [211, 3942.12] + - [345, 3942.12] - - [5056, 448, 1, 3328] - - [251, 4587.83] + - [385, 4587.83] - - [4288, 1408, 1, 256] - - [251, 5408.83] + - [385, 5408.83] - - [3584, 64, 1, 256] - - [220, 2496.71] + - [354, 2496.71] - - [64, 1856, 1, 3328] - - [193, 5896.78] + - [327, 5896.78] - - [256, 1408, 1, 128] - - [232, 1643.17] + - [366, 1643.17] - - [5888, 1408, 1, 128] - - [231, 4436.37] + - [365, 4436.37] - - [4288, 2368, 1, 1280] - - [240, 9433.04] + - [374, 9433.04] - - [4, 4288, 1, 256] - - [255, 442.732] + - [389, 442.732] - - [256, 4288, 1, 128] - - [232, 2814.79] + - [366, 2814.79] - - [256, 128, 1, 3328] - - [221, 3951.26] + - [355, 3951.26] - - [6784, 2368, 1, 256] - - [240, 9169.99] + - [374, 9169.99] - - [5888, 128, 1, 128] - - [169, 3156.81] + - [303, 3156.81] - - [4288, 1856, 1, 256] - - [246, 5658.23] + - [380, 5658.23] - - [1856, 256, 1, 3328] - - [240, 7646.37] + - [374, 7646.37] - - [1856, 2944, 1, 256] - - [247, 6444.98] + - [381, 6444.98] - - [5056, 1024, 1, 128] - - [231, 4607.3] + - [365, 4607.3] - - [64, 5888, 1, 1280] - - [246, 5842.46] + - [380, 5842.46] - - [1760, 7133, 1, 1760] - - [239, 9097.84] + - [373, 9097.84] - - [6784, 256, 1, 128] - - [232, 3685.41] + - [366, 3685.41] - - [5888, 704, 1, 128] - - [231, 3656.23] + - [365, 3656.23] - - [6784, 64, 1, 128] - - [182, 2191.52] + - [316, 2191.52] - - [1024, 4288, 1, 1280] - - [246, 9199.32] + - [380, 9199.32] - - [2368, 5056, 1, 3328] - - [242, 9072.88] + - [376, 9072.88] - - [448, 4, 1, 128] - - [253, 5.42937] + - [387, 5.42937] - - [4, 256, 1, 3328] - - [263, 311.037] + - [397, 311.037] - - [4288, 1024, 1, 3328] - - [244, 8660.33] + - [378, 8660.33] - - [1024, 5056, 1, 3328] - - [240, 8886.76] + - [374, 8886.76] - - [1024, 1856, 1, 3328] - - [245, 8426.24] + - [379, 8426.24] - - [704, 704, 1, 1280] - - [240, 7661.8] + - [374, 7661.8] - - [128, 2368, 1, 1280] - - [214, 5746.15] + - [348, 5746.15] - - [1408, 128, 1, 3328] - - [222, 6530.87] + - [356, 6530.87] - - [3584, 256, 1, 1280] - - [246, 7634.04] + - [380, 7634.04] - - [4, 128, 1, 128] - - [253, 2.07874] + - [387, 2.07874] - - [704, 6784, 1, 128] - - [235, 4589.59] + - [369, 4589.59] - - [3584, 128, 1, 1280] - - [240, 7078.24] + - [374, 7078.24] - - [4, 256, 1, 1280] - - [209, 178.187] + - [343, 178.187] - - [128, 704, 1, 3328] - - [214, 5959.81] + - [348, 5959.81] - - [4288, 6784, 1, 256] - - [240, 9326.54] + - [374, 9326.54] - - [3584, 2944, 1, 3328] - - [242, 9114.16] + - [376, 9114.16] - - [128, 1856, 1, 256] - - [246, 3672.65] + - [380, 3672.65] - - [64, 4288, 1, 256] - - [240, 3457.51] + - [374, 3457.51] - - [4, 3584, 1, 3328] - - [189, 694.37] + - [323, 694.37] - - [64, 4, 1, 3328] - - [209, 71.5738] + - [343, 71.5738] - - [4, 64, 1, 3328] - - [209, 91.9069] + - [343, 91.9069] - - [5888, 2944, 1, 256] - - [239, 7241.55] + - [373, 7241.55] - - [2368, 6784, 1, 128] - - [235, 5229.63] + - [369, 5229.63] - - [448, 4288, 1, 1280] - - [240, 8416.4] + - [374, 8416.4] - - [448, 1856, 1, 3328] - - [240, 7161.56] + - [374, 7161.56] - - [4, 1024, 1, 256] - - [206, 187.346] + - [340, 187.346] - - [5056, 4288, 1, 256] - - [251, 8947.26] + - [385, 8947.26] - - [1024, 448, 1, 256] - - [246, 5318.96] + - [380, 5318.96] - - [1024, 3584, 1, 256] - - [241, 6152.04] + - [375, 6152.04] - - [2944, 128, 1, 1280] - - [222, 6053.63] + - [356, 6053.63] - - [1856, 5056, 1, 128] - - [232, 5091.42] + - [366, 5091.42] - - [64, 256, 1, 256] - - [195, 771.112] + - [329, 771.112] - - [1408, 4, 1, 128] - - [252, 40.8758] + - [386, 40.8758] - - [128, 2368, 1, 128] - - [180, 1520.37] + - [314, 1520.37] - - [256, 704, 1, 1280] - - [240, 4329.81] + - [374, 4329.81] - - [64, 2368, 1, 128] - - [171, 1212.52] + - [305, 1212.52] - - [6784, 6784, 1, 3328] - - [251, 8310.67] + - [385, 8310.67] - - [448, 5888, 1, 1280] - - [246, 8502.33] + - [380, 8502.33] - - [5056, 448, 1, 128] - - [232, 4161.0] + - [366, 4161.0] - - [3584, 2944, 1, 128] - - [232, 4363.51] + - [366, 4363.51] - - [6784, 256, 1, 1280] - - [246, 8629.67] + - [380, 8629.67] - - [256, 2944, 1, 1280] - - [246, 7277.48] + - [380, 7277.48] - - [64, 4288, 1, 128] - - [171, 1822.06] + - [305, 1822.06] - - [2368, 5888, 1, 3328] - - [240, 9017.52] + - [374, 9017.52] - - [4, 64, 1, 256] - - [206, 16.1627] + - [340, 16.1627] - - [704, 1024, 1, 3328] - - [246, 8059.55] + - [380, 8059.55] - - [2368, 1856, 1, 1280] - - [246, 8813.24] + - [380, 8813.24] - - [128, 448, 1, 128] - - [167, 588.244] + - [301, 588.244] - - [128, 6784, 1, 256] - - [246, 6538.28] + - [380, 6538.28] - - [3584, 4288, 1, 128] - - [232, 5025.46] + - [366, 5025.46] - - [64, 448, 1, 128] - - [184, 231.793] + - [318, 231.793] - - [5888, 4288, 1, 3328] - - [240, 9515.88] + - [374, 9515.88] - - [2368, 704, 1, 256] - - [246, 7642.84] + - [380, 7642.84] - - [256, 1856, 1, 3328] - - [246, 6547.17] + - [380, 6547.17] - - [1856, 128, 1, 256] - - [240, 3782.28] + - [374, 3782.28] - - [6784, 128, 1, 128] - - [174, 2835.54] + - [308, 2835.54] - - [3584, 1408, 1, 128] - - [231, 3049.21] + - [365, 3049.21] - - [1856, 5056, 1, 1280] - - [247, 8863.3] + - [381, 8863.3] - - [2944, 1024, 1, 1280] - - [251, 8873.25] + - [385, 8873.25] - - [5056, 4, 1, 256] - - [187, 494.121] + - [321, 494.121] - - [3584, 5888, 1, 3328] - - [239, 9585.25] + - [373, 9585.25] - - [2368, 4288, 1, 256] - - [251, 6419.05] + - [385, 6419.05] - - [1024, 2368, 1, 3328] - - [246, 8645.36] + - [380, 8645.36] - - [64, 704, 1, 3328] - - [228, 4399.93] + - [362, 4399.93] - - [704, 1408, 1, 256] - - [240, 7428.54] + - [374, 7428.54] - - [6784, 1856, 1, 3328] - - [251, 9163.66] + - [385, 9163.66] - - [1024, 2944, 1, 128] - - [235, 3551.98] + - [369, 3551.98] - - [1024, 3584, 1, 1280] - - [249, 9112.47] + - [383, 9112.47] - - [4288, 5888, 1, 3328] - - [239, 8524.05] + - [373, 8524.05] - - [4288, 4, 1, 3328] - - [206, 620.016] + - [340, 620.016] - - [256, 1408, 1, 256] - - [240, 4505.7] + - [374, 4505.7] - - [448, 2944, 1, 1280] - - [240, 7612.87] + - [374, 7612.87] - - [4, 5888, 1, 128] - - [252, 174.564] + - [386, 174.564] - - [1024, 2944, 1, 3328] - - [245, 9136.74] + - [379, 9136.74] - - [3584, 6784, 1, 256] - - [245, 7253.89] + - [379, 7253.89] - - [256, 6784, 1, 1280] - - [240, 8637.72] + - [374, 8637.72] - - [1856, 3584, 1, 256] - - [246, 8199.67] + - [380, 8199.67] - - [128, 448, 1, 3328] - - [227, 4799.92] + - [361, 4799.92] - - [6784, 1856, 1, 128] - - [232, 5185.62] + - [366, 5185.62] - - [4, 448, 1, 256] - - [206, 86.9848] + - [340, 86.9848] - - [2944, 704, 1, 128] - - [235, 3798.64] + - [369, 3798.64] - - [256, 5888, 1, 1280] - - [240, 8678.47] + - [374, 8678.47] - - [4, 128, 1, 1280] - - [209, 102.5] + - [343, 102.5] - - [4288, 6784, 1, 3328] - - [245, 8209.4] + - [379, 8209.4] - - [6784, 128, 1, 1280] - - [222, 6562.99] + - [356, 6562.99] - - [64, 1408, 1, 256] - - [212, 2059.8] + - [346, 2059.8] - - [7680, 5481, 1, 2560] - - [251, 9426.79] + - [385, 9426.79] - - [2368, 1408, 1, 128] - - [232, 4532.5] + - [366, 4532.5] - - [1856, 448, 1, 256] - - [240, 6275.48] + - [374, 6275.48] - - [1408, 1024, 1, 128] - - [232, 3604.58] + - [366, 3604.58] - - [128, 64, 1, 128] - - [167, 87.4813] + - [301, 87.4813] - - [6784, 3584, 1, 3328] - - [247, 8991.92] + - [381, 8991.92] - - [2944, 64, 1, 3328] - - [216, 6043.36] + - [350, 6043.36] - - [64, 64, 1, 128] - - [172, 36.309] + - [306, 36.309] - - [2368, 5056, 1, 1280] - - [246, 9438.48] + - [380, 9438.48] - - [64, 4, 1, 1280] - - [209, 40.2569] + - [343, 40.2569] - - [1408, 2368, 1, 1280] - - [242, 7738.16] + - [376, 7738.16] - - [128, 1408, 1, 1280] - - [214, 4937.74] + - [348, 4937.74] - - [256, 64, 1, 3328] - - [224, 2683.46] + - [358, 2683.46] - - [2944, 4288, 1, 128] - - [232, 5173.81] + - [366, 5173.81] - - [2944, 2944, 1, 256] - - [240, 8943.92] + - [374, 8943.92] - - [2944, 4, 1, 1280] - - [189, 617.857] + - [323, 617.857] - - [5888, 4, 1, 256] - - [255, 483.218] + - [389, 483.218] - - [6784, 256, 1, 256] - - [246, 7916.7] + - [380, 7916.7] - - [256, 5056, 1, 3328] - - [240, 8953.25] + - [374, 8953.25] - - [128, 4288, 1, 1280] - - [193, 6015.05] + - [327, 6015.05] - - [5056, 1856, 1, 128] - - [234, 4221.15] + - [368, 4221.15] - - [5888, 1408, 1, 256] - - [245, 9144.85] + - [379, 9144.85] - - [128, 128, 1, 256] - - [195, 759.938] + - [329, 759.938] - - [5056, 4, 1, 3328] - - [255, 642.818] + - [389, 642.818] - - [4288, 3584, 1, 3328] - - [241, 9300.05] + - [375, 9300.05] - - [448, 704, 1, 3328] - - [247, 4481.08] + - [381, 4481.08] - - [448, 448, 1, 128] - - [171, 1360.81] + - [305, 1360.81] - - [1024, 2368, 1, 1280] - - [240, 8570.29] + - [374, 8570.29] - - [1856, 704, 1, 3328] - - [240, 8448.26] + - [374, 8448.26] - - [4, 2368, 1, 128] - - [252, 64.5902] + - [386, 64.5902] - - [5888, 6784, 1, 3328] - - [247, 9447.12] + - [381, 9447.12] - - [704, 4288, 1, 1280] - - [249, 7476.87] + - [383, 7476.87] - - [704, 256, 1, 256] - - [240, 2957.62] + - [374, 2957.62] - - [6784, 448, 1, 3328] - - [243, 8886.22] + - [377, 8886.22] - - [4288, 1024, 1, 128] - - [231, 3864.49] + - [365, 3864.49] - - [49, 512, 128, 2048] - - [274, 7112.78] + - [408, 7112.78] - - [196, 256, 256, 1024] - - [268, 8302.7] + - [402, 8302.7] - - [784, 512, 256, 128] - - [266, 9061.36] + - [400, 9061.36] - - [49, 2048, 128, 512] - - [264, 6963.36] + - [398, 6963.36] - - [784, 512, 64, 128] - - [266, 8822.62] + - [400, 8822.62] - - [784, 128, 128, 512] - - [273, 8983.63] + - [407, 8983.63] - - [196, 256, 64, 1024] - - [272, 7823.5] + - [406, 7823.5] - - [3136, 256, 256, 64] - - [269, 9051.38] + - [403, 9051.38] - - [3136, 64, 128, 64] - - [265, 8581.35] + - [399, 8581.35] - - [49, 2048, 256, 512] - - [264, 7049.64] + - [398, 7049.64] - - [196, 1024, 64, 256] - - [267, 7953.69] + - [401, 7953.69] - - [784, 128, 256, 512] - - [275, 9102.99] + - [409, 9102.99] - - [196, 256, 128, 1024] - - [267, 8085.89] + - [401, 8085.89] - - [3136, 64, 64, 256] - - [271, 9266.13] + - [405, 9266.13] - - [784, 128, 64, 512] - - [272, 8809.39] + - [406, 8809.39] - - [49, 2048, 64, 512] - - [264, 6843.95] + - [398, 6843.95] - - [3136, 64, 128, 256] - - [271, 9381.39] + - [405, 9381.39] - - [3136, 256, 128, 64] - - [269, 8982.64] + - [403, 8982.64] - - [784, 512, 128, 128] - - [266, 8965.99] + - [400, 8965.99] - - [3136, 256, 64, 64] - - [269, 8879.8] + - [403, 8879.8] - - [3136, 64, 256, 256] - - [271, 9566.43] + - [405, 9566.43] - - [3136, 64, 64, 64] - - [270, 8314.05] + - [404, 8314.05] - - [3136, 64, 256, 64] - - [265, 8743.8] + - [399, 8743.8] - - [196, 1024, 128, 256] - - [268, 8119.43] + - [402, 8119.43] - - [49, 512, 64, 2048] - - [276, 7055.41] + - [410, 7055.41] - - [49, 512, 256, 2048] - - [277, 7166.41] + - [411, 7166.41] - - [196, 1024, 256, 256] - - [268, 8210.66] + - [402, 8210.66] - - [5329, 160, 64, 64] - - [284, 8156.89] + - [418, 8156.89] - - [1225, 288, 64, 48] - - [288, 6926.23] + - [422, 6926.23] - - [1225, 192, 64, 64] - - [290, 7840.1] + - [424, 7840.1] - - [64, 1280, 64, 384] - - [291, 9276.11] + - [425, 9276.11] - - [1225, 384, 64, 192] - - [281, 9162.35] + - [415, 9162.35] - - [1225, 288, 64, 64] - - [282, 7495.27] + - [416, 7495.27] - - [5329, 64, 64, 80] - - [283, 8480.13] + - [417, 8480.13] - - [289, 1024, 64, 256] - - [281, 8483.83] + - [415, 8483.83] - - [289, 768, 64, 192] - - [287, 8234.84] + - [421, 8234.84] - - [289, 768, 64, 128] - - [287, 7988.81] + - [421, 7988.81] - - [64, 1536, 64, 384] - - [291, 9323.65] + - [425, 9323.65] - - [1225, 384, 64, 64] - - [290, 8158.8] + - [424, 8158.8] - - [64, 2048, 64, 192] - - [287, 8818.61] + - [421, 8818.61] - - [64, 1280, 64, 320] - - [283, 9202.17] + - [417, 9202.17] - - [1225, 384, 64, 96] - - [281, 8540.7] + - [415, 8540.7] - - [64, 1280, 64, 448] - - [287, 9317.82] + - [421, 9317.82] - - [289, 768, 64, 160] - - [291, 8128.81] + - [425, 8128.81] - - [1225, 192, 64, 32] - - [290, 6495.37] + - [424, 6495.37] - - [64, 1536, 64, 256] - - [287, 9143.0] + - [421, 9143.0] - - [1225, 256, 64, 48] - - [285, 7545.36] + - [419, 7545.36] - - [1225, 256, 64, 64] - - [286, 7972.45] + - [420, 7972.45] - - [1225, 192, 64, 48] - - [289, 7348.9] + - [423, 7348.9] - - [289, 1024, 64, 384] - - [279, 8725.66] + - [413, 8725.66] - - [289, 1024, 64, 192] - - [281, 8313.16] + - [415, 8313.16] - - [64, 1280, 64, 192] - - [283, 8768.68] + - [417, 8768.68] - - [64, 2048, 64, 320] - - [280, 9147.98] + - [414, 9147.98] - - [64, 2048, 64, 448] - - [278, 9304.16] + - [412, 9304.16] - - [64, 2048, 64, 384] - - [280, 9235.28] + - [414, 9235.28] - - [289, 1024, 64, 128] - - [287, 7989.51] + - [421, 7989.51] - - [4096, 1024, 1, 2984] - - [326, 9846.39] + - [460, 9846.39] - - [1024, 4096, 1, 3437] - - [327, 9915.8] + - [461, 9915.8] - - [1024, 4096, 1, 3235] - - [320, 9914.02] + - [454, 9914.02] - - [4096, 1024, 1, 4032] - - [326, 9926.06] + - [460, 9926.06] - - [1024, 4096, 1, 3334] - - [327, 9918.27] + - [461, 9918.27] - - [4096, 1024, 1, 3288] - - [327, 9854.67] + - [461, 9854.67] - - [1024, 4096, 1, 3515] - - [327, 9924.03] + - [461, 9924.03] - - [4096, 1024, 1, 3437] - - [327, 9869.63] + - [461, 9869.63] - - [1024, 4096, 1, 3259] - - [327, 9907.65] + - [461, 9907.65] - - [1024, 4096, 1, 3384] - - [319, 9921.21] + - [453, 9921.21] - - [64, 92, 688, 92] - - [297, 6137.89] + - [431, 6137.89] - - [4096, 1024, 1, 3458] - - [326, 9887.69] + - [460, 9887.69] - - [1024, 4096, 1, 3412] - - [326, 9930.56] + - [460, 9930.56] - - [1024, 4096, 1, 3529] - - [320, 9924.54] + - [454, 9924.54] - - [1024, 4096, 1, 4032] - - [327, 9963.48] + - [461, 9963.48] - - [4096, 1024, 1, 3999] - - [327, 9895.0] + - [461, 9895.0] - - [1024, 4096, 1, 3079] - - [320, 9894.58] + - [454, 9894.58] - - [1024, 4096, 1, 3876] - - [319, 9949.39] + - [453, 9949.39] - - [1024, 4096, 1, 3450] - - [327, 9915.65] + - [461, 9915.65] - - [1024, 4096, 1, 3256] - - [327, 9911.18] + - [461, 9911.18] - - [4096, 1024, 1, 3403] - - [326, 9858.93] + - [460, 9858.93] - - [1024, 1024, 1, 3975] - - [317, 8990.81] + - [451, 8990.81] - - [1024, 4096, 1, 3359] - - [327, 9915.0] + - [461, 9915.0] - - [4096, 1024, 1, 3549] - - [326, 9870.66] + - [460, 9870.66] - - [4096, 1024, 1, 3176] - - [327, 9855.92] + - [461, 9855.92] - - [1024, 4096, 1, 3504] - - [319, 9934.17] + - [453, 9934.17] - - [4096, 1024, 1, 3314] - - [326, 9873.9] + - [460, 9873.9] - - [4096, 1024, 1, 3183] - - [326, 9843.84] + - [460, 9843.84] - - [1024, 4096, 1, 3209] - - [320, 9904.97] + - [454, 9904.97] - - [1024, 4096, 1, 3720] - - [319, 9934.16] + - [453, 9934.16] - - [1024, 4096, 1, 3859] - - [319, 9952.53] + - [453, 9952.53] - - [1024, 33708, 1, 4059] - - [319, 10321.5] + - [453, 10321.5] - - [1024, 4096, 1, 3968] - - [319, 9955.96] + - [453, 9955.96] - - [64, 123, 528, 123] - - [292, 6916.21] + - [426, 6916.21] - - [4096, 1024, 1, 3477] - - [327, 9872.03] + - [461, 9872.03] - - [4096, 1024, 1, 3233] - - [327, 9862.35] + - [461, 9862.35] - - [4096, 1024, 1, 3409] - - [327, 9876.86] + - [461, 9876.86] - - [4096, 1024, 1, 3564] - - [327, 9870.49] + - [461, 9870.49] - - [64, 102, 624, 100] - - [292, 5773.16] + - [426, 5773.16] - - [4096, 1024, 1, 3190] - - [326, 9850.97] + - [460, 9850.97] - - [64, 112, 576, 111] - - [292, 6517.35] + - [426, 6517.35] - - [1024, 4096, 1, 3288] - - [326, 9911.9] + - [460, 9911.9] - - [4096, 1024, 1, 3451] - - [326, 9859.61] + - [460, 9859.61] - - [1024, 4096, 1, 3348] - - [319, 9915.47] + - [453, 9915.47] - - [64, 102, 624, 102] - - [292, 5783.7] + - [426, 5783.7] - - [1024, 4096, 1, 3465] - - [320, 9913.12] + - [454, 9913.12] - - [1024, 33708, 1, 4032] - - [319, 10340.4] + - [453, 10340.4] - - [1024, 33708, 1, 3840] - - [319, 10341.8] + - [453, 10341.8] - - [4096, 1024, 1, 3391] - - [327, 9861.77] + - [461, 9861.77] - - [1024, 4096, 1, 3530] - - [319, 9920.44] + - [453, 9920.44] - - [4096, 1024, 1, 3209] - - [326, 9847.0] + - [460, 9847.0] - - [1024, 4096, 1, 3457] - - [320, 9917.29] + - [454, 9917.29] - - [1024, 4096, 1, 3386] - - [319, 9917.65] + - [453, 9917.65] - - [4096, 1024, 1, 3350] - - [326, 9884.54] + - [460, 9884.54] - - [1024, 4096, 1, 3184] - - [327, 9925.98] + - [461, 9925.98] - - [1024, 4096, 1, 3093] - - [326, 9902.55] + - [460, 9902.55] - - [64, 133, 480, 135] - - [309, 6205.97] + - [443, 6205.97] - - [1024, 4096, 1, 3400] - - [319, 9917.1] + - [453, 9917.1] - - [1024, 1024, 1, 4026] - - [325, 9014.39] + - [459, 9014.39] - - [1024, 4096, 1, 3214] - - [319, 9895.94] + - [453, 9895.94] - - [4096, 1024, 1, 3406] - - [327, 9857.82] + - [461, 9857.82] - - [1024, 4096, 1, 3565] - - [326, 9919.37] + - [460, 9919.37] - - [4096, 1024, 1, 3536] - - [327, 9889.06] + - [461, 9889.06] - - [1024, 4096, 1, 3183] - - [326, 9907.55] + - [460, 9907.55] - - [1024, 4096, 1, 3462] - - [327, 9922.4] + - [461, 9922.4] - - [4096, 1024, 1, 3130] - - [320, 9846.04] + - [454, 9846.04] - - [4096, 1024, 1, 3381] - - [327, 9868.27] + - [461, 9868.27] - - [4096, 1024, 1, 3298] - - [326, 9870.54] + - [460, 9870.54] - - [1024, 4096, 1, 3292] - - [319, 9906.3] + - [453, 9906.3] - - [4096, 1024, 1, 3289] - - [326, 9856.55] + - [460, 9856.55] - - [64, 160, 400, 159] - - [312, 7427.84] + - [446, 7427.84] - - [1024, 4096, 1, 3379] - - [319, 9917.09] + - [453, 9917.09] - - [1024, 4096, 1, 3990] - - [320, 9947.37] + - [454, 9947.37] - - [1024, 4096, 1, 3540] - - [327, 9935.76] + - [461, 9935.76] - - [4096, 1024, 1, 3412] - - [327, 9867.56] + - [461, 9867.56] - - [1024, 1024, 1, 3780] - - [322, 9036.26] + - [456, 9036.26] - - [1024, 4096, 1, 3555] - - [326, 9927.37] + - [460, 9927.37] - - [1024, 4096, 1, 3518] - - [320, 9925.55] + - [454, 9925.55] - - [4096, 1024, 1, 3189] - - [326, 9861.24] + - [460, 9861.24] - - [1024, 4096, 1, 3298] - - [320, 9923.22] + - [454, 9923.22] - - [4096, 1024, 1, 3072] - - [326, 9872.08] + - [460, 9872.08] - - [1024, 4096, 1, 3393] - - [327, 9929.28] + - [461, 9929.28] - - [1024, 4096, 1, 3207] - - [319, 9912.81] + - [453, 9912.81] - - [64, 228, 272, 232] - - [315, 7350.14] + - [449, 7350.14] - - [64, 23, 2720, 23] - - [296, 2640.25] + - [430, 2640.25] - - [4096, 1024, 1, 3487] - - [327, 9860.91] + - [461, 9860.91] - - [1024, 1024, 1, 3822] - - [325, 8993.96] + - [459, 8993.96] - - [64, 77, 816, 77] - - [297, 5273.19] + - [431, 5273.19] - - [4096, 1024, 1, 3431] - - [327, 9867.53] + - [461, 9867.53] - - [4096, 1024, 1, 3378] - - [326, 9888.14] + - [460, 9888.14] - - [4096, 1024, 1, 3529] - - [320, 9879.5] + - [454, 9879.5] - - [4096, 1024, 1, 3460] - - [327, 9877.25] + - [461, 9877.25] - - [1024, 4096, 1, 3336] - - [319, 9912.41] + - [453, 9912.41] - - [1024, 4096, 1, 3501] - - [320, 9914.4] + - [454, 9914.4] - - [64, 159, 400, 159] - - [310, 7016.51] + - [444, 7016.51] - - [1024, 4096, 1, 3584] - - [327, 9940.59] + - [461, 9940.59] - - [64, 135, 480, 134] - - [310, 6241.39] + - [444, 6241.39] - - [64, 99, 624, 99] - - [301, 5617.39] + - [435, 5617.39] - - [4096, 1024, 1, 2499] - - [326, 9813.57] + - [460, 9813.57] - - [1024, 1024, 1, 3942] - - [322, 9060.01] + - [456, 9060.01] - - [4096, 1024, 1, 3352] - - [326, 9867.12] + - [460, 9867.12] - - [1024, 4096, 1, 3543] - - [327, 9928.77] + - [461, 9928.77] - - [1024, 4096, 1, 3476] - - [326, 9931.58] + - [460, 9931.58] - - [1024, 33708, 1, 3822] - - [319, 10324.7] + - [453, 10324.7] - - [1024, 4096, 1, 3436] - - [319, 9917.28] + - [453, 9917.28] - - [1024, 1024, 1, 3861] - - [318, 8998.49] + - [452, 8998.49] - - [1024, 1024, 1, 4000] - - [323, 9058.3] + - [457, 9058.3] - - [1024, 4096, 1, 3594] - - [319, 9927.88] + - [453, 9927.88] - - [4096, 1024, 1, 3514] - - [327, 9872.3] + - [461, 9872.3] - - [1024, 4096, 1, 3064] - - [326, 9907.1] + - [460, 9907.1] - - [4096, 1024, 1, 3371] - - [319, 9857.74] + - [453, 9857.74] - - [4096, 1024, 1, 3558] - - [327, 9876.31] + - [461, 9876.31] - - [4096, 1024, 1, 3517] - - [326, 9866.45] + - [460, 9866.45] - - [4096, 1024, 1, 3144] - - [326, 9846.36] + - [460, 9846.36] - - [1024, 4096, 1, 3312] - - [319, 9932.85] + - [453, 9932.85] - - [4096, 1024, 1, 3079] - - [326, 9851.1] + - [460, 9851.1] - - [1024, 4096, 1, 3415] - - [319, 9919.47] + - [453, 9919.47] - - [1024, 4096, 1, 3221] - - [326, 9908.18] + - [460, 9908.18] - - [1024, 4096, 1, 3978] - - [320, 9944.41] + - [454, 9944.41] - - [4096, 1024, 1, 3876] - - [326, 9898.99] + - [460, 9898.99] - - [1024, 4096, 1, 3528] - - [319, 9919.6] + - [453, 9919.6] - - [1024, 4096, 1, 3181] - - [327, 9894.86] + - [461, 9894.86] - - [4096, 1024, 1, 3445] - - [326, 9878.54] + - [460, 9878.54] - - [4096, 1024, 1, 3450] - - [319, 9864.82] + - [453, 9864.82] - - [4096, 1024, 1, 3377] - - [326, 9879.69] + - [460, 9879.69] - - [1024, 4096, 1, 3532] - - [320, 9928.19] + - [454, 9928.19] - - [1024, 33708, 1, 3944] - - [319, 10329.7] + - [453, 10329.7] - - [4096, 1024, 1, 3483] - - [326, 9861.83] + - [460, 9861.83] - - [1024, 4096, 1, 3358] - - [319, 9903.69] + - [453, 9903.69] - - [4096, 1024, 1, 3464] - - [326, 9876.84] + - [460, 9876.84] - - [4096, 1024, 1, 3282] - - [319, 9859.23] + - [453, 9859.23] - - [4096, 1024, 1, 3256] - - [327, 9855.1] + - [461, 9855.1] - - [1024, 4096, 1, 3057] - - [326, 9910.75] + - [460, 9910.75] - - [4096, 1024, 1, 3481] - - [326, 9866.29] + - [460, 9866.29] - - [4096, 1024, 1, 3340] - - [326, 9862.25] + - [460, 9862.25] - - [1024, 1024, 1, 3870] - - [325, 9082.45] + - [459, 9082.45] - - [1024, 4096, 1, 3273] - - [319, 9916.29] + - [453, 9916.29] - - [64, 65, 992, 65] - - [310, 4683.01] + - [444, 4683.01] - - [4096, 1024, 1, 3392] - - [320, 9881.12] + - [454, 9881.12] - - [4096, 1024, 1, 3337] - - [326, 9864.5] + - [460, 9864.5] - - [4096, 1024, 1, 3359] - - [326, 9874.42] + - [460, 9874.42] - - [4096, 1024, 1, 3498] - - [327, 9864.35] + - [461, 9864.35] - - [4096, 1024, 1, 3169] - - [326, 9851.1] + - [460, 9851.1] - - [1024, 33708, 1, 3859] - - [320, 10332.6] + - [454, 10332.6] - - [64, 19, 3264, 19] - - [296, 2182.14] + - [430, 2182.14] - - [1024, 4096, 1, 3103] - - [319, 9898.9] + - [453, 9898.9] - - [4096, 1024, 1, 3900] - - [326, 9897.12] + - [460, 9897.12] - - [1024, 4096, 1, 3442] - - [326, 9938.97] + - [460, 9938.97] - - [1024, 4096, 1, 3248] - - [326, 9939.92] + - [460, 9939.92] - - [1024, 4096, 1, 3351] - - [327, 9923.23] + - [461, 9923.23] - - [4096, 1024, 1, 3593] - - [326, 9894.36] + - [460, 9894.36] - - [1024, 4096, 1, 3780] - - [326, 9941.96] + - [460, 9941.96] - - [64, 133, 480, 133] - - [310, 6180.79] + - [444, 6180.79] - - [1024, 33708, 1, 3681] - - [319, 10332.3] + - [453, 10332.3] - - [4096, 1024, 1, 3374] - - [320, 9859.36] + - [454, 9859.36] - - [1024, 4096, 1, 3557] - - [319, 9928.2] + - [453, 9928.2] - - [4096, 1024, 1, 3906] - - [326, 9907.07] + - [460, 9907.07] - - [4096, 1024, 1, 3504] - - [326, 9886.05] + - [460, 9886.05] - - [1024, 4096, 1, 3270] - - [326, 9916.37] + - [460, 9916.37] - - [4096, 1024, 1, 3098] - - [319, 9854.76] + - [453, 9854.76] - - [64, 232, 272, 232] - - [315, 7394.1] + - [449, 7394.1] - - [4096, 1024, 1, 3216] - - [327, 9876.57] + - [461, 9876.57] - - [64, 148, 432, 148] - - [312, 6663.85] + - [446, 6663.85] - - [1024, 4096, 1, 3550] - - [326, 9920.28] + - [460, 9920.28] - - [4096, 1024, 1, 3449] - - [320, 9870.57] + - [454, 9870.57] - - [1024, 4096, 1, 3403] - - [327, 9908.21] + - [461, 9908.21] - - [1024, 4096, 1, 3523] - - [326, 9932.71] + - [460, 9932.71] - - [1024, 4096, 1, 3486] - - [326, 9917.46] + - [460, 9917.46] - - [1024, 4096, 1, 3564] - - [326, 9923.44] + - [460, 9923.44] - - [1024, 33708, 1, 4005] - - [319, 10339.5] + - [453, 10339.5] - - [4096, 1024, 1, 3296] - - [326, 9879.78] + - [460, 9879.78] - - [1024, 4096, 1, 3263] - - [319, 9907.17] + - [453, 9907.17] - - [64, 25, 2512, 25] - - [296, 2848.17] + - [430, 2848.17] - - [1024, 4096, 1, 3130] - - [327, 9900.1] + - [461, 9900.1] - - [1024, 4096, 1, 3295] - - [327, 9895.45] + - [461, 9895.45] - - [1024, 33708, 1, 3925] - - [320, 10342.3] + - [454, 10342.3] - - [1024, 4096, 1, 3378] - - [319, 9921.37] + - [453, 9921.37] - - [4096, 1024, 1, 3720] - - [327, 9885.82] + - [461, 9885.82] - - [4096, 1024, 1, 3399] - - [326, 9880.65] + - [460, 9880.65] - - [4096, 1024, 1, 3543] - - [327, 9870.73] + - [461, 9870.73] - - [64, 9, 6544, 9] - - [299, 955.17] + - [433, 955.17] - - [4096, 1024, 1, 3497] - - [326, 9868.43] + - [460, 9868.43] - - [4096, 1024, 1, 3594] - - [327, 9876.88] + - [461, 9876.88] - - [1024, 4096, 1, 3144] - - [327, 9901.96] + - [461, 9901.96] - - [1024, 4096, 1, 3975] - - [320, 9950.19] + - [454, 9950.19] - - [4096, 1024, 1, 3205] - - [327, 9856.07] + - [461, 9856.07] - - [1024, 33708, 1, 3995] - - [319, 10331.1] + - [453, 10331.1] - - [1024, 4096, 1, 3392] - - [319, 9935.78] + - [453, 9935.78] - - [1024, 4096, 1, 3055] - - [327, 9893.25] + - [461, 9893.25] - - [1024, 4096, 1, 4026] - - [327, 9940.22] + - [461, 9940.22] - - [4096, 1024, 1, 3557] - - [326, 9884.0] + - [460, 9884.0] - - [4096, 1024, 1, 3515] - - [326, 9871.94] + - [460, 9871.94] - - [4096, 1024, 1, 3486] - - [327, 9860.74] + - [461, 9860.74] - - [4096, 1024, 1, 3457] - - [327, 9885.37] + - [461, 9885.37] - - [1024, 4096, 1, 3511] - - [319, 9928.24] + - [453, 9928.24] - - [4096, 1024, 1, 3138] - - [326, 9854.06] + - [460, 9854.06] - - [1024, 4096, 1, 3339] - - [320, 9912.89] + - [454, 9912.89] - - [1024, 4096, 1, 3939] - - [320, 9952.26] + - [454, 9952.26] - - [4096, 1024, 1, 3500] - - [320, 9863.62] + - [454, 9863.62] - - [4096, 1024, 1, 3395] - - [327, 9883.82] + - [461, 9883.82] - - [4096, 1024, 1, 3968] - - [327, 9920.36] + - [461, 9920.36] - - [4096, 1024, 1, 4020] - - [327, 9912.81] + - [461, 9912.81] - - [4096, 1024, 1, 3942] - - [326, 9910.17] + - [460, 9910.17] - - [1024, 1024, 1, 4032] - - [316, 9024.74] + - [450, 9024.74] - - [4096, 1024, 1, 3349] - - [327, 9866.04] + - [461, 9866.04] - - [1024, 4096, 1, 3322] - - [320, 9908.43] + - [454, 9908.43] - - [4096, 1024, 1, 3452] - - [326, 9872.69] + - [460, 9872.69] - - [1024, 4096, 1, 3417] - - [326, 9912.64] + - [460, 9912.64] - - [1024, 1024, 1, 4012] - - [324, 9085.47] + - [458, 9085.47] - - [1024, 4096, 1, 3526] - - [320, 9920.36] + - [454, 9920.36] - - [4096, 1024, 1, 3485] - - [320, 9861.64] + - [454, 9861.64] - - [1024, 1024, 1, 3681] - - [324, 8991.46] + - [458, 8991.46] - - [4096, 1024, 1, 3303] - - [327, 9861.3] + - [461, 9861.3] - - [4096, 1024, 1, 3344] - - [327, 9892.44] + - [461, 9892.44] - - [1024, 4096, 1, 3479] - - [327, 9921.77] + - [461, 9921.77] - - [4096, 1024, 1, 3300] - - [326, 9868.64] + - [460, 9868.64] - - [1024, 4096, 1, 3439] - - [320, 9918.29] + - [454, 9918.29] - - [4096, 1024, 1, 3280] - - [327, 9875.29] + - [461, 9875.29] - - [1024, 4096, 1, 3245] - - [319, 9910.49] + - [453, 9910.49] - - [1024, 4096, 1, 3328] - - [319, 9941.6] + - [453, 9941.6] - - [4096, 1024, 1, 3418] - - [319, 9870.76] + - [453, 9870.76] - - [1024, 4096, 1, 3493] - - [327, 9938.45] + - [461, 9938.45] - - [1024, 4096, 1, 3500] - - [319, 9916.93] + - [453, 9916.93] - - [1024, 4096, 1, 3166] - - [319, 9898.12] + - [453, 9898.12] - - [4096, 1024, 1, 3126] - - [320, 9847.04] + - [454, 9847.04] - - [1024, 4096, 1, 3277] - - [327, 9898.66] + - [461, 9898.66] - - [1024, 4096, 1, 3315] - - [326, 9923.11] + - [460, 9923.11] - - [1024, 1024, 1, 3927] - - [317, 8987.71] + - [451, 8987.71] - - [1024, 4096, 1, 3414] - - [319, 9916.01] + - [453, 9916.01] - - [4096, 1024, 1, 3531] - - [326, 9871.92] + - [460, 9871.92] - - [4096, 1024, 1, 3484] - - [319, 9867.86] + - [453, 9867.86] - - [1024, 4096, 1, 3180] - - [326, 9904.09] + - [460, 9904.09] - - [4096, 1024, 1, 3360] - - [326, 9879.57] + - [460, 9879.57] - - [1024, 33708, 1, 3990] - - [319, 10335.0] + - [453, 10335.0] - - [4096, 1024, 1, 3466] - - [326, 9875.02] + - [460, 9875.02] - - [1024, 4096, 1, 3428] - - [319, 9916.02] + - [453, 9916.02] - - [1024, 4096, 1, 3137] - - [326, 9913.27] + - [460, 9913.27] - - [4096, 1024, 1, 4059] - - [326, 9901.86] + - [460, 9901.86] - - [1024, 4096, 1, 3353] - - [326, 9914.6] + - [460, 9914.6] - - [1024, 4096, 1, 3942] - - [326, 9944.5] + - [460, 9944.5] - - [4096, 1024, 1, 3506] - - [319, 9875.75] + - [453, 9875.75] - - [1024, 1024, 1, 3894] - - [317, 8946.55] + - [451, 8946.55] - - [4096, 1024, 1, 3508] - - [327, 9877.67] + - [461, 9877.67] - - [64, 132, 480, 135] - - [310, 6164.86] + - [444, 6164.86] - - [4096, 1024, 1, 3956] - - [319, 9907.83] + - [453, 9907.83] - - [64, 7, 8192, 7] - - [298, 813.078] + - [432, 813.078] - - [1024, 4096, 1, 3272] - - [320, 9909.82] + - [454, 9909.82] - - [1024, 4096, 1, 3443] - - [327, 9929.83] + - [461, 9929.83] - - [1024, 4096, 1, 3375] - - [327, 9909.23] + - [461, 9909.23] - - [1024, 4096, 1, 3525] - - [327, 9929.27] + - [461, 9929.27] - - [4096, 1024, 1, 3472] - - [326, 9889.97] + - [460, 9889.97] - - [1024, 4096, 1, 3520] - - [319, 9947.79] + - [453, 9947.79] - - [4096, 1024, 1, 3322] - - [326, 9862.98] + - [460, 9862.98] - - [4096, 1024, 1, 3387] - - [326, 9861.62] + - [460, 9861.62] - - [64, 8, 7280, 8] - - [304, 1024.1] + - [438, 1024.1] - - [1024, 33708, 1, 3939] - - [319, 10339.9] + - [453, 10339.9] - - [4096, 1024, 1, 3345] - - [327, 9873.68] + - [461, 9873.68] - - [4096, 1024, 1, 2967] - - [326, 9839.21] + - [460, 9839.21] - - [1024, 4096, 1, 3453] - - [319, 9905.81] + - [453, 9905.81] - - [1024, 4096, 1, 3640] - - [326, 9934.05] + - [460, 9934.05] - - [4096, 1024, 1, 3291] - - [320, 9860.84] + - [454, 9860.84] - - [1024, 4096, 1, 3350] - - [327, 9918.03] + - [461, 9918.03] - - [4096, 1024, 1, 3417] - - [326, 9864.61] + - [460, 9864.61] - - [64, 135, 480, 135] - - [310, 6265.45] + - [444, 6265.45] - - [1024, 4096, 1, 3467] - - [320, 9906.95] + - [454, 9906.95] - - [1024, 4096, 1, 3491] - - [326, 9933.3] + - [460, 9933.3] - - [1024, 4096, 1, 3822] - - [326, 9938.75] + - [460, 9938.75] - - [4096, 1024, 1, 3292] - - [326, 9849.21] + - [460, 9849.21] - - [1024, 4096, 1, 3231] - - [319, 9905.82] + - [453, 9905.82] - - [1024, 4096, 1, 3364] - - [320, 9930.32] + - [454, 9930.32] - - [1024, 4096, 1, 3995] - - [320, 9943.76] + - [454, 9943.76] - - [1024, 4096, 1, 3545] - - [319, 9928.53] + - [453, 9928.53] - - [1024, 1024, 1, 3876] - - [317, 9003.04] + - [451, 9003.04] - - [1024, 4096, 1, 3186] - - [319, 9921.01] + - [453, 9921.01] - - [4096, 1024, 1, 3432] - - [326, 9875.29] + - [460, 9875.29] - - [64, 84, 752, 85] - - [297, 5704.51] + - [431, 5704.51] - - [4096, 1024, 1, 3367] - - [320, 9868.06] + - [454, 9868.06] - - [4096, 1024, 1, 3503] - - [327, 9871.01] + - [461, 9871.01] - - [1024, 4096, 1, 3095] - - [320, 9902.9] + - [454, 9902.9] - - [4096, 1024, 1, 3465] - - [327, 9872.17] + - [461, 9872.17] - - [1024, 4096, 1, 3402] - - [326, 9914.66] + - [460, 9914.66] - - [4096, 1024, 1, 3140] - - [326, 9847.95] + - [460, 9847.95] - - [1024, 1024, 1, 4050] - - [323, 9055.75] + - [457, 9055.75] - - [4096, 1024, 1, 3424] - - [320, 9894.62] + - [454, 9894.62] - - [4096, 1024, 1, 3257] - - [319, 9860.97] + - [453, 9860.97] - - [4096, 1024, 1, 2917] - - [326, 9845.91] + - [460, 9845.91] - - [1024, 33708, 1, 3640] - - [319, 10321.7] + - [453, 10321.7] - - [1024, 4096, 1, 3456] - - [319, 9950.35] + - [453, 9950.35] - - [1024, 4096, 1, 3014] - - [319, 9907.97] + - [453, 9907.97] - - [4096, 1024, 1, 3372] - - [327, 9868.37] + - [461, 9868.37] - - [64, 132, 480, 132] - - [310, 6121.62] + - [444, 6121.62] - - [1024, 4096, 1, 3294] - - [327, 9903.23] + - [461, 9903.23] - - [4096, 1024, 1, 3446] - - [327, 9871.69] + - [461, 9871.69] - - [1024, 4096, 1, 3389] - - [320, 9909.27] + - [454, 9909.27] - - [4096, 1024, 1, 3259] - - [326, 9860.76] + - [460, 9860.76] - - [4096, 1024, 1, 3544] - - [326, 9878.76] + - [460, 9878.76] - - [4096, 1024, 1, 3479] - - [327, 9873.97] + - [461, 9873.97] - - [4096, 1024, 1, 3542] - - [326, 9878.97] + - [460, 9878.97] - - [4096, 1024, 1, 3321] - - [319, 9861.13] + - [453, 9861.13] - - [1024, 4096, 1, 3147] - - [319, 9894.77] + - [453, 9894.77] - - [1024, 4096, 1, 3944] - - [319, 9950.51] + - [453, 9950.51] - - [4096, 1024, 1, 3870] - - [327, 9881.74] + - [461, 9881.74] - - [1024, 4096, 1, 3308] - - [319, 9907.26] + - [453, 9907.26] - - [4096, 1024, 1, 3401] - - [326, 9864.59] + - [460, 9864.59] - - [1024, 4096, 1, 3395] - - [319, 9929.03] + - [453, 9929.03] - - [64, 99, 624, 102] - - [295, 5651.36] + - [429, 5651.36] - - [1024, 4096, 1, 3563] - - [326, 9922.76] + - [460, 9922.76] - - [1024, 33708, 1, 3870] - - [319, 10325.4] + - [453, 10325.4] - - [4096, 1024, 1, 3494] - - [326, 9875.37] + - [460, 9875.37] - - [1024, 4096, 1, 3271] - - [319, 9913.09] + - [453, 9913.09] - - [1024, 33708, 1, 3910] - - [319, 10341.5] + - [453, 10341.5] - - [1024, 4096, 1, 3287] - - [327, 9924.87] + - [461, 9924.87] - - [1024, 33708, 1, 3860] - - [319, 10330.7] + - [453, 10330.7] - - [64, 143, 432, 148] - - [312, 6571.78] + - [446, 6571.78] - - [1024, 1024, 1, 3584] - - [324, 8975.31] + - [458, 8975.31] - - [64, 162, 400, 162] - - [314, 6822.26] + - [448, 6822.26] - - [4096, 1024, 1, 3341] - - [326, 9854.66] + - [460, 9854.66] - - [1024, 4096, 1, 3136] - - [319, 9926.86] + - [453, 9926.86] - - [4096, 1024, 1, 3439] - - [326, 9854.33] + - [460, 9854.33] - - [64, 148, 432, 147] - - [310, 6677.61] + - [444, 6677.61] - - [1024, 4096, 1, 3751] - - [326, 9938.48] + - [460, 9938.48] - - [1024, 4096, 1, 3301] - - [326, 9919.15] + - [460, 9919.15] - - [4096, 1024, 1, 3468] - - [327, 9859.83] + - [461, 9859.83] - - [1024, 4096, 1, 3416] - - [327, 9918.52] + - [461, 9918.52] - - [4096, 1024, 1, 3163] - - [326, 9854.65] + - [460, 9854.65] - - [1024, 4096, 1, 3230] - - [320, 9897.54] + - [454, 9897.54] - - [1024, 4096, 1, 3581] - - [320, 9915.48] + - [454, 9915.48] - - [1024, 1024, 1, 3960] - - [322, 9045.86] + - [456, 9045.86] - - [4096, 1024, 1, 3463] - - [327, 9884.74] + - [461, 9884.74] - - [1024, 4096, 1, 3478] - - [320, 9927.02] + - [454, 9927.02] - - [4096, 1024, 1, 3262] - - [326, 9852.22] + - [460, 9852.22] - - [1024, 4096, 1, 3438] - - [326, 9912.68] + - [460, 9912.68] - - [1024, 4096, 1, 3244] - - [319, 9900.51] + - [453, 9900.51] - - [1024, 4096, 1, 3445] - - [319, 9920.32] + - [453, 9920.32] - - [4096, 1024, 1, 3328] - - [326, 9888.07] + - [460, 9888.07] - - [1024, 4096, 1, 3492] - - [320, 9937.22] + - [454, 9937.22] - - [4096, 1024, 1, 3211] - - [320, 9847.95] + - [454, 9847.95] - - [1024, 4096, 1, 3910] - - [327, 9946.57] + - [461, 9946.57] - - [1024, 4096, 1, 3314] - - [319, 9932.6] + - [453, 9932.6] - - [4096, 1024, 1, 3859] - - [326, 9902.84] + - [460, 9902.84] - - [4096, 1024, 1, 3383] - - [326, 9875.2] + - [460, 9875.2] - - [1024, 4096, 1, 3409] - - [327, 9926.79] + - [461, 9926.79] - - [1024, 4096, 1, 4020] - - [319, 9941.8] + - [453, 9941.8] - - [4096, 1024, 1, 3530] - - [326, 9872.81] + - [460, 9872.81] - - [4096, 1024, 1, 3411] - - [327, 9875.02] + - [461, 9875.02] - - [1024, 4096, 1, 3566] - - [327, 9921.1] + - [461, 9921.1] - - [4096, 1024, 1, 3493] - - [319, 9875.74] + - [453, 9875.74] - - [4096, 1024, 1, 3184] - - [326, 9873.14] + - [460, 9873.14] - - [1024, 4096, 1, 3072] - - [319, 9923.79] + - [453, 9923.79] - - [1024, 4096, 1, 3431] - - [320, 9911.03] + - [454, 9911.03] - - [4096, 1024, 1, 3306] - - [327, 9853.42] + - [461, 9853.42] - - [1024, 4096, 1, 3352] - - [327, 9913.32] + - [461, 9913.32] - - [4096, 1024, 1, 3295] - - [326, 9862.68] + - [460, 9862.68] - - [64, 123, 528, 122] - - [292, 6950.25] + - [426, 6950.25] - - [1024, 4096, 1, 3517] - - [320, 9920.06] + - [454, 9920.06] - - [64, 102, 624, 101] - - [300, 5791.49] + - [434, 5791.49] - - [4096, 1024, 1, 3426] - - [326, 9891.14] + - [460, 9891.14] - - [4096, 1024, 1, 3385] - - [326, 9868.41] + - [460, 9868.41] - - [1024, 1024, 1, 3978] - - [317, 9008.48] + - [451, 9008.48] - - [4096, 1024, 1, 3572] - - [319, 9884.81] + - [453, 9884.81] - - [4096, 1024, 1, 3459] - - [326, 9892.17] + - [460, 9892.17] - - [1024, 4096, 1, 3374] - - [327, 9908.52] + - [461, 9908.52] - - [4096, 1024, 1, 3166] - - [326, 9832.45] + - [460, 9832.45] - - [4096, 1024, 1, 3093] - - [327, 9841.25] + - [461, 9841.25] - - [4096, 1024, 1, 3523] - - [320, 9879.05] + - [454, 9879.05] - - [4096, 1024, 1, 3413] - - [320, 9880.81] + - [454, 9880.81] - - [1024, 4096, 1, 3996] - - [319, 9948.14] + - [453, 9948.14] - - [1024, 4096, 1, 3452] - - [327, 9915.97] + - [461, 9915.97] - - [4096, 1024, 1, 3232] - - [327, 9876.54] + - [461, 9876.54] - - [4096, 1024, 1, 3400] - - [319, 9867.15] + - [453, 9867.15] - - [4096, 1024, 1, 3334] - - [326, 9868.99] + - [460, 9868.99] - - [1024, 4096, 1, 3345] - - [319, 9920.6] + - [453, 9920.6] - - [1024, 4096, 1, 3538] - - [326, 9933.34] + - [460, 9933.34] - - [1024, 4096, 1, 3466] - - [326, 9920.85] + - [460, 9920.85] - - [4096, 1024, 1, 3315] - - [326, 9876.87] + - [460, 9876.87] - - [4096, 1024, 1, 3214] - - [327, 9847.93] + - [461, 9847.93] - - [1024, 33708, 1, 3900] - - [319, 10331.7] + - [453, 10331.7] - - [64, 160, 400, 160] - - [312, 7440.61] + - [446, 7440.61] - - [1024, 4096, 1, 3367] - - [326, 9926.32] + - [460, 9926.32] - - [1024, 4096, 1, 2917] - - [327, 9904.57] + - [461, 9904.57] - - [1024, 1024, 1, 3995] - - [318, 9000.33] + - [452, 9000.33] - - [64, 132, 480, 134] - - [310, 6146.88] + - [444, 6146.88] - - [1024, 4096, 1, 3544] - - [327, 9924.14] + - [461, 9924.14] - - [4096, 1024, 1, 3414] - - [327, 9867.9] + - [461, 9867.9] - - [4096, 1024, 1, 3565] - - [320, 9870.13] + - [454, 9870.13] - - [1024, 4096, 1, 3512] - - [326, 9919.84] + - [460, 9919.84] - - [1024, 4096, 1, 3191] - - [327, 9914.79] + - [461, 9914.79] - - [64, 27, 2336, 27] - - [294, 3054.71] + - [428, 3054.71] - - [1024, 4096, 1, 3289] - - [327, 9917.2] + - [461, 9917.2] - - [4096, 1024, 1, 3290] - - [326, 9858.41] + - [460, 9858.41] - - [1024, 4096, 1, 3211] - - [327, 9897.16] + - [461, 9897.16] - - [1024, 33708, 1, 3969] - - [320, 10336.1] + - [454, 10336.1] - - [4096, 1024, 1, 3566] - - [326, 9863.0] + - [460, 9863.0] - - [64, 111, 576, 111] - - [300, 6400.91] + - [434, 6400.91] - - [1024, 4096, 1, 3459] - - [326, 9923.03] + - [460, 9923.03] - - [1024, 4096, 1, 3372] - - [319, 9909.86] + - [453, 9909.86] - - [4096, 1024, 1, 3339] - - [326, 9859.3] + - [460, 9859.3] - - [4096, 1024, 1, 3425] - - [326, 9889.34] + - [460, 9889.34] - - [4096, 1024, 1, 3388] - - [326, 9871.67] + - [460, 9871.67] - - [1024, 4096, 1, 3531] - - [319, 9919.0] + - [453, 9919.0] - - [4096, 1024, 1, 3286] - - [327, 9868.42] + - [461, 9868.42] - - [4096, 1024, 1, 3462] - - [326, 9881.88] + - [460, 9881.88] - - [1024, 4096, 1, 3388] - - [319, 9904.69] + - [453, 9904.69] - - [4096, 1024, 1, 3165] - - [319, 9836.33] + - [453, 9836.33] - - [4096, 1024, 1, 3304] - - [326, 9857.55] + - [460, 9857.55] - - [1024, 4096, 1, 2736] - - [326, 9901.07] + - [460, 9901.07] - - [4096, 1024, 1, 3397] - - [326, 9872.1] + - [460, 9872.1] - - [64, 38, 1680, 38] - - [293, 3459.52] + - [427, 3459.52] - - [1024, 4096, 1, 3311] - - [327, 9908.32] + - [461, 9908.32] - - [1024, 4096, 1, 3394] - - [327, 9929.43] + - [461, 9929.43] - - [4096, 1024, 1, 2736] - - [326, 9833.88] + - [460, 9833.88] - - [1024, 4096, 1, 3559] - - [320, 9925.33] + - [454, 9925.33] - - [4096, 1024, 1, 3180] - - [326, 9838.05] + - [460, 9838.05] - - [1024, 4096, 1, 3480] - - [319, 9922.46] + - [453, 9922.46] - - [4096, 1024, 1, 3318] - - [326, 9867.87] + - [460, 9867.87] - - [4096, 1024, 1, 3213] - - [326, 9846.02] + - [460, 9846.02] - - [1024, 4096, 1, 3286] - - [326, 9912.14] + - [460, 9912.14] - - [4096, 1024, 1, 3471] - - [326, 9874.24] + - [460, 9874.24] - - [1024, 4096, 1, 3381] - - [327, 9922.96] + - [461, 9922.96] - - [64, 100, 624, 100] - - [301, 5705.24] + - [435, 5705.24] - - [4096, 1024, 1, 3502] - - [326, 9872.44] + - [460, 9872.44] - - [64, 16, 3840, 16] - - [307, 2091.67] + - [441, 2091.67] - - [1024, 4096, 1, 3552] - - [319, 9943.89] + - [453, 9943.89] - - [4096, 1024, 1, 3519] - - [327, 9869.95] + - [461, 9869.95] - - [1024, 4096, 1, 3300] - - [320, 9916.15] + - [454, 9916.15] - - [1024, 4096, 1, 3419] - - [319, 9914.06] + - [453, 9914.06] - - [4096, 1024, 1, 4030] - - [320, 9893.73] + - [454, 9893.73] - - [4096, 1024, 1, 3976] - - [327, 9898.35] + - [461, 9898.35] - - [1024, 4096, 1, 3473] - - [327, 9928.42] + - [461, 9928.42] - - [1024, 1024, 1, 3977] - - [324, 9009.33] + - [458, 9009.33] - - [4096, 1024, 1, 3428] - - [326, 9876.79] + - [460, 9876.79] - - [1024, 4096, 1, 3433] - - [320, 9923.92] + - [454, 9923.92] - - [4096, 1024, 1, 3534] - - [320, 9864.0] + - [454, 9864.0] - - [4096, 1024, 1, 3461] - - [326, 9873.12] + - [460, 9873.12] - - [4096, 1024, 1, 3681] - - [326, 9898.57] + - [460, 9898.57] - - [4096, 1024, 1, 3495] - - [327, 9876.08] + - [461, 9876.08] - - [4096, 1024, 1, 3351] - - [326, 9879.71] + - [460, 9879.71] - - [1024, 4096, 1, 4059] - - [319, 9948.61] + - [453, 9948.61] - - [4096, 1024, 1, 3990] - - [326, 9900.76] + - [460, 9900.76] - - [1024, 4096, 1, 3325] - - [320, 9903.3] + - [454, 9903.3] - - [1024, 4096, 1, 3408] - - [326, 9932.15] + - [460, 9932.15] - - [64, 59, 1088, 59] - - [300, 5343.77] + - [434, 5343.77] - - [4096, 1024, 1, 3394] - - [327, 9878.17] + - [461, 9878.17] - - [1024, 4096, 1, 3573] - - [327, 9935.3] + - [461, 9935.3] - - [4096, 1024, 1, 3386] - - [326, 9866.38] + - [460, 9866.38] - - [4096, 1024, 1, 3540] - - [326, 9882.33] + - [460, 9882.33] - - [1024, 4096, 1, 3182] - - [320, 9894.45] + - [454, 9894.45] - - [1024, 4096, 1, 3430] - - [319, 9915.24] + - [453, 9915.24] - - [1024, 4096, 1, 3236] - - [327, 9920.56] + - [461, 9920.56] - - [4096, 1024, 1, 2977] - - [326, 9848.08] + - [460, 9848.08] - - [1024, 4096, 1, 3355] - - [326, 9908.78] + - [460, 9908.78] - - [4096, 1024, 1, 3139] - - [326, 9850.71] + - [460, 9850.71] - - [4096, 1024, 1, 3516] - - [320, 9874.21] + - [454, 9874.21] - - [4096, 1024, 1, 3368] - - [320, 9872.64] + - [454, 9872.64] - - [4096, 1024, 1, 3559] - - [319, 9884.32] + - [453, 9884.32] - - [64, 11, 5456, 11] - - [307, 1382.67] + - [441, 1382.67] - - [1024, 4096, 1, 3506] - - [326, 9937.69] + - [460, 9937.69] - - [1024, 4096, 1, 3145] - - [319, 9905.11] + - [453, 9905.11] - - [1024, 4096, 1, 3369] - - [326, 9912.71] + - [460, 9912.71] - - [64, 112, 576, 112] - - [292, 6583.56] + - [426, 6583.56] - - [4096, 1024, 1, 3522] - - [326, 9889.47] + - [460, 9889.47] - - [1024, 33708, 1, 3894] - - [319, 10337.5] + - [453, 10337.5] - - [64, 159, 400, 162] - - [310, 7057.09] + - [444, 7057.09] - - [4096, 1024, 1, 3336] - - [326, 9867.67] + - [460, 9867.67] - - [1024, 4096, 1, 3382] - - [320, 9915.9] + - [454, 9915.9] - - [4096, 1024, 1, 3533] - - [326, 9878.56] + - [460, 9878.56] - - [4096, 1024, 1, 4050] - - [327, 9916.82] + - [461, 9916.82] - - [4096, 1024, 1, 3480] - - [320, 9869.32] + - [454, 9869.32] - - [1024, 4096, 1, 3344] - - [319, 9935.61] + - [453, 9935.61] - - [64, 122, 528, 122] - - [292, 6871.14] + - [426, 6871.14] - - [1024, 4096, 1, 3509] - - [320, 9925.8] + - [454, 9925.8] - - [1024, 4096, 1, 3956] - - [319, 9958.26] + - [453, 9958.26] - - [4096, 1024, 1, 3616] - - [326, 9904.63] + - [460, 9904.63] - - [1024, 4096, 1, 3366] - - [319, 9919.47] + - [453, 9919.47] - - [4096, 1024, 1, 2935] - - [319, 9833.23] + - [453, 9833.23] - - [4096, 1024, 1, 3393] - - [326, 9877.45] + - [460, 9877.45] - - [4096, 1024, 1, 3547] - - [320, 9865.1] + - [454, 9865.1] - - [1024, 4096, 1, 3499] - - [327, 9912.49] + - [461, 9912.49] - - [4096, 1024, 1, 3357] - - [326, 9855.28] + - [460, 9855.28] - - [4096, 1024, 1, 3272] - - [326, 9861.97] + - [460, 9861.97] - - [4096, 1024, 1, 3207] - - [326, 9847.78] + - [460, 9847.78] - - [4096, 1024, 1, 3894] - - [326, 9918.86] + - [460, 9918.86] - - [1024, 4096, 1, 3444] - - [326, 9932.71] + - [460, 9932.71] - - [4096, 1024, 1, 3561] - - [326, 9872.61] + - [460, 9872.61] - - [4096, 1024, 1, 3376] - - [326, 9885.59] + - [460, 9885.59] - - [1024, 4096, 1, 3458] - - [326, 9929.39] + - [460, 9929.39] - - [4096, 1024, 1, 3231] - - [320, 9847.08] + - [454, 9847.08] - - [64, 228, 272, 228] - - [321, 7302.69] + - [455, 7302.69] - - [1024, 4096, 1, 3505] - - [327, 9931.63] + - [461, 9931.63] - - [4096, 1024, 1, 3277] - - [326, 9857.2] + - [460, 9857.2] - - [64, 21, 2976, 21] - - [296, 2436.14] + - [430, 2436.14] - - [1024, 4096, 1, 3391] - - [326, 9911.25] + - [460, 9911.25] - - [64, 32, 1984, 32] - - [308, 3572.17] + - [442, 3572.17] - - [1024, 4096, 1, 3536] - - [327, 9946.9] + - [461, 9946.9] - - [1024, 4096, 1, 3063] - - [326, 9906.92] + - [460, 9906.92] - - [1024, 1024, 1, 3925] - - [318, 9011.45] + - [452, 9011.45] - - [1024, 4096, 1, 3189] - - [320, 9900.95] + - [454, 9900.95] - - [1024, 4096, 1, 2505] - - [326, 9854.85] + - [460, 9854.85] - - [4096, 1024, 1, 3454] - - [319, 9864.96] + - [453, 9864.96] - - [1024, 4096, 1, 3405] - - [327, 9906.33] + - [461, 9906.33] - - [1024, 33708, 1, 4050] - - [320, 10343.7] + - [454, 10343.7] - - [4096, 1024, 1, 3520] - - [326, 9887.03] + - [460, 9887.03] - - [64, 93, 688, 93] - - [303, 6222.86] + - [437, 6222.86] - - [1024, 4096, 1, 3487] - - [327, 9918.69] + - [461, 9918.69] - - [1024, 4096, 1, 3558] - - [327, 9930.99] + - [461, 9930.99] - - [4096, 1024, 1, 3297] - - [326, 9874.31] + - [460, 9874.31] - - [1024, 1024, 1, 3840] - - [322, 9075.42] + - [456, 9075.42] - - [1024, 4096, 1, 3483] - - [326, 9915.38] + - [460, 9915.38] - - [1024, 1024, 1, 3956] - - [325, 9010.03] + - [459, 9010.03] - - [1024, 33708, 1, 3751] - - [320, 10325.9] + - [454, 10325.9] - - [4096, 1024, 1, 3380] - - [326, 9888.47] + - [460, 9888.47] - - [1024, 4096, 1, 3380] - - [319, 9927.25] + - [453, 9927.25] - - [1024, 4096, 1, 3396] - - [327, 9931.96] + - [461, 9931.96] - - [1024, 4096, 1, 3497] - - [320, 9914.86] + - [454, 9914.86] - - [1024, 4096, 1, 3502] - - [327, 9921.52] + - [461, 9921.52] - - [1024, 1024, 1, 3976] - - [322, 9060.3] + - [456, 9060.3] - - [1024, 4096, 1, 3138] - - [320, 9908.66] + - [454, 9908.66] - - [4096, 1024, 1, 3939] - - [319, 9910.23] + - [453, 9910.23] - - [1024, 4096, 1, 3303] - - [320, 9916.64] + - [454, 9916.64] - - [64, 111, 576, 112] - - [300, 6495.19] + - [434, 6495.19] - - [1024, 4096, 1, 3418] - - [326, 9913.35] + - [460, 9913.35] - - [1024, 4096, 1, 3224] - - [320, 9904.05] + - [454, 9904.05] - - [4096, 1024, 1, 3978] - - [326, 9896.28] + - [460, 9896.28] - - [1024, 4096, 1, 3472] - - [319, 9937.48] + - [453, 9937.48] - - [4096, 1024, 1, 3353] - - [327, 9863.97] + - [461, 9863.97] - - [4096, 1024, 1, 3362] - - [326, 9871.06] + - [460, 9871.06] - - [1024, 33708, 1, 3978] - - [319, 10325.4] + - [453, 10325.4] - - [64, 100, 624, 102] - - [295, 5695.67] + - [429, 5695.67] - - [1024, 4096, 1, 3432] - - [327, 9915.56] + - [461, 9915.56] - - [1024, 4096, 1, 3139] - - [326, 9914.21] + - [460, 9914.21] - - [1024, 4096, 1, 3341] - - [327, 9912.1] + - [461, 9912.1] - - [1024, 4096, 1, 3494] - - [320, 9924.6] + - [454, 9924.6] - - [1024, 4096, 1, 3969] - - [319, 9952.28] + - [453, 9952.28] - - [1024, 4096, 1, 3163] - - [327, 9911.79] + - [461, 9911.79] - - [1024, 1024, 1, 3955] - - [317, 9097.86] + - [451, 9097.86] - - [4096, 1024, 1, 3405] - - [326, 9853.84] + - [460, 9853.84] - - [1024, 1024, 1, 4030] - - [317, 9083.86] + - [451, 9083.86] - - [4096, 1024, 1, 3453] - - [326, 9858.88] + - [460, 9858.88] - - [1024, 4096, 1, 3411] - - [327, 9926.54] + - [461, 9926.54] - - [1024, 4096, 1, 3527] - - [320, 9922.65] + - [454, 9922.65] - - [4096, 1024, 1, 3474] - - [326, 9878.49] + - [460, 9878.49] - - [1024, 4096, 1, 3572] - - [326, 9932.0] + - [460, 9932.0] - - [4096, 1024, 1, 3293] - - [326, 9848.26] + - [460, 9848.26] - - [4096, 1024, 1, 3247] - - [326, 9861.45] + - [460, 9861.45] - - [64, 15, 4096, 15] - - [307, 1955.75] + - [441, 1955.75] - - [1024, 4096, 1, 3425] - - [327, 9936.4] + - [461, 9936.4] - - [1024, 4096, 1, 3354] - - [319, 9917.55] + - [453, 9917.55] - - [4096, 1024, 1, 3382] - - [326, 9885.49] + - [460, 9885.49] - - [4096, 1024, 1, 3236] - - [326, 9860.6] + - [460, 9860.6] - - [1024, 4096, 1, 3519] - - [327, 9919.3] + - [461, 9919.3] - - [4096, 1024, 1, 3354] - - [326, 9854.75] + - [460, 9854.75] - - [4096, 1024, 1, 3501] - - [327, 9869.62] + - [461, 9869.62] - - [1024, 1024, 1, 3906] - - [325, 9104.99] + - [459, 9104.99] - - [4096, 1024, 1, 3266] - - [326, 9873.97] + - [460, 9873.97] - - [64, 101, 624, 102] - - [295, 5765.52] + - [429, 5765.52] - - [1024, 4096, 1, 3368] - - [326, 9909.77] + - [460, 9909.77] - - [1024, 4096, 1, 4030] - - [327, 9940.27] + - [461, 9940.27] - - [1024, 4096, 1, 3533] - - [320, 9916.64] + - [454, 9916.64] - - [4096, 1024, 1, 3332] - - [327, 9876.45] + - [461, 9876.45] - - [4096, 1024, 1, 3584] - - [326, 9896.6] + - [460, 9896.6] - - [1024, 4096, 1, 3616] - - [326, 9957.18] + - [460, 9957.18] - - [4096, 1024, 1, 3265] - - [326, 9877.78] + - [460, 9877.78] - - [4096, 1024, 1, 3361] - - [326, 9888.61] + - [460, 9888.61] - - [4096, 1024, 1, 3467] - - [326, 9863.4] + - [460, 9863.4] - - [1024, 4096, 1, 3454] - - [320, 9904.89] + - [454, 9904.89] - - [1024, 4096, 1, 3101] - - [327, 9893.12] + - [461, 9893.12] - - [1024, 4096, 1, 3508] - - [327, 9931.54] + - [461, 9931.54] - - [4096, 1024, 1, 3267] - - [326, 9864.48] + - [460, 9864.48] - - [64, 54, 1184, 54] - - [292, 4906.02] + - [426, 4906.02] - - [4096, 1024, 1, 3419] - - [326, 9872.56] + - [460, 9872.56] - - [4096, 1024, 1, 3822] - - [326, 9892.63] + - [460, 9892.63] - - [1024, 4096, 1, 3266] - - [326, 9918.58] + - [460, 9918.58] - - [4096, 1024, 1, 3440] - - [327, 9890.16] + - [461, 9890.16] - - [1024, 4096, 1, 3361] - - [326, 9930.97] + - [460, 9930.97] - - [1024, 4096, 1, 3546] - - [320, 9926.56] + - [454, 9926.56] - - [4096, 1024, 1, 3473] - - [326, 9889.06] + - [460, 9889.06] - - [4096, 1024, 1, 3546] - - [327, 9872.27] + - [461, 9872.27] - - [1024, 4096, 1, 3088] - - [320, 9918.03] + - [454, 9918.03] - - [1024, 4096, 1, 3535] - - [327, 9921.2] + - [461, 9921.2] - - [1024, 4096, 1, 3447] - - [327, 9920.63] + - [461, 9920.63] - - [1024, 4096, 1, 3560] - - [326, 9925.48] + - [460, 9925.48] - - [1024, 4096, 1, 3422] - - [320, 9922.21] + - [454, 9922.21] - - [1024, 4096, 1, 3469] - - [319, 9906.18] + - [453, 9906.18] - - [4096, 1024, 1, 3488] - - [326, 9903.26] + - [460, 9903.26] - - [1024, 4096, 1, 3110] - - [326, 9906.76] + - [460, 9906.76] - - [1024, 4096, 1, 3265] - - [327, 9916.69] + - [461, 9916.69] - - [1024, 4096, 1, 3291] - - [326, 9902.73] + - [460, 9902.73] - - [1024, 4096, 1, 3390] - - [327, 9907.22] + - [461, 9907.22] - - [4096, 1024, 1, 3046] - - [326, 9847.68] + - [460, 9847.68] - - [1024, 4096, 1, 3539] - - [327, 9933.49] + - [461, 9933.49] - - [4096, 1024, 1, 3221] - - [327, 9860.74] + - [461, 9860.74] - - [4096, 1024, 1, 3433] - - [326, 9872.74] + - [460, 9872.74] - - [4096, 1024, 1, 3364] - - [327, 9881.91] + - [461, 9881.91] - - [4096, 1024, 1, 3470] - - [326, 9858.56] + - [460, 9858.56] - - [1024, 4096, 1, 3404] - - [319, 9907.27] + - [453, 9907.27] - - [1024, 33708, 1, 3968] - - [320, 10350.3] + - [454, 10350.3] - - [4096, 1024, 1, 3088] - - [326, 9869.06] + - [460, 9869.06] - - [1024, 4096, 1, 3247] - - [326, 9901.02] + - [460, 9901.02] - - [1024, 33708, 1, 3996] - - [319, 10328.5] + - [453, 10328.5] - - [4096, 1024, 1, 3482] - - [327, 9866.99] + - [461, 9866.99] - - [1024, 1024, 1, 3796] - - [322, 9031.68] + - [456, 9031.68] - - [4096, 1024, 1, 3995] - - [327, 9896.78] + - [461, 9896.78] - - [1024, 1024, 1, 3859] - - [324, 9097.36] + - [458, 9097.36] - - [1024, 4096, 1, 3280] - - [320, 9934.05] + - [454, 9934.05] - - [4096, 1024, 1, 3271] - - [327, 9860.09] + - [461, 9860.09] - - [64, 10, 5952, 10] - - [307, 1221.02] + - [441, 1221.02] - - [4096, 1024, 1, 3545] - - [326, 9877.35] + - [460, 9877.35] - - [4096, 1024, 1, 3476] - - [319, 9882.57] + - [453, 9882.57] - - [4096, 1024, 1, 3496] - - [320, 9880.5] + - [454, 9880.5] - - [4096, 1024, 1, 3191] - - [320, 9858.7] + - [454, 9858.7] - - [4096, 1024, 1, 3311] - - [327, 9853.2] + - [461, 9853.2] - - [1024, 4096, 1, 3302] - - [327, 9919.32] + - [461, 9919.32] - - [1024, 4096, 1, 3681] - - [326, 9944.99] + - [460, 9944.99] - - [4096, 1024, 1, 3582] - - [319, 9869.77] + - [453, 9869.77] - - [4096, 1024, 1, 3421] - - [327, 9856.08] + - [461, 9856.08] - - [4096, 1024, 1, 3560] - - [320, 9884.48] + - [454, 9884.48] - - [1024, 4096, 1, 3495] - - [327, 9930.13] + - [461, 9930.13] - - [4096, 1024, 1, 3186] - - [326, 9870.59] + - [460, 9870.59] - - [4096, 1024, 1, 3925] - - [326, 9904.0] + - [460, 9904.0] - - [64, 71, 896, 71] - - [311, 5004.79] + - [445, 5004.79] - - [1024, 4096, 1, 3435] - - [327, 9916.58] + - [461, 9916.58] - - [4096, 1024, 1, 3434] - - [326, 9871.29] + - [460, 9871.29] - - [1024, 33708, 1, 4012] - - [319, 10332.5] + - [453, 10332.5] - - [1024, 4096, 1, 3340] - - [319, 9918.11] + - [453, 9918.11] - - [1024, 1024, 1, 3860] - - [317, 8999.36] + - [451, 8999.36] - - [4096, 1024, 1, 3489] - - [326, 9882.02] + - [460, 9882.02] - - [1024, 4096, 1, 3162] - - [327, 9906.28] + - [461, 9906.28] - - [4096, 1024, 1, 3436] - - [326, 9858.12] + - [460, 9858.12] - - [1024, 1024, 1, 4005] - - [323, 9043.06] + - [457, 9043.06] - - [64, 84, 752, 84] - - [296, 5629.93] + - [430, 5629.93] - - [4096, 1024, 1, 3574] - - [326, 9886.7] + - [460, 9886.7] - - [4096, 1024, 1, 3469] - - [319, 9856.26] + - [453, 9856.26] - - [1024, 4096, 1, 3410] - - [320, 9924.74] + - [454, 9924.74] - - [1024, 4096, 1, 3216] - - [319, 9930.67] + - [453, 9930.67] - - [4096, 1024, 1, 3095] - - [326, 9847.01] + - [460, 9847.01] - - [1024, 1024, 1, 3990] - - [325, 9089.04] + - [459, 9089.04] - - [4096, 1024, 1, 3448] - - [326, 9863.94] + - [460, 9863.94] - - [1024, 4096, 1, 3176] - - [327, 9914.01] + - [461, 9914.01] - - [64, 49, 1296, 49] - - [292, 4437.46] + - [426, 4437.46] - - [4096, 1024, 1, 2918] - - [326, 9830.93] + - [460, 9830.93] - - [64, 14, 4368, 14] - - [306, 1802.47] + - [440, 1802.47] - - [1024, 4096, 1, 3424] - - [326, 9934.05] + - [460, 9934.05] - - [4096, 1024, 1, 3402] - - [319, 9863.12] + - [453, 9863.12] - - [4096, 1024, 1, 3145] - - [320, 9856.56] + - [454, 9856.56] - - [64, 134, 480, 134] - - [312, 6184.05] + - [446, 6184.05] - - [1024, 33708, 1, 3976] - - [320, 10330.1] + - [454, 10330.1] - - [4096, 1024, 1, 3518] - - [319, 9856.07] + - [453, 9856.07] - - [4096, 1024, 1, 3110] - - [326, 9856.46] + - [460, 9856.46] - - [4096, 1024, 1, 3325] - - [326, 9852.36] + - [460, 9852.36] - - [1024, 33708, 1, 3999] - - [319, 10329.7] + - [453, 10329.7] - - [4096, 1024, 1, 2985] - - [326, 9837.3] + - [460, 9837.3] - - [1024, 4096, 1, 3371] - - [319, 9913.03] + - [453, 9913.03] - - [4096, 1024, 1, 3342] - - [326, 9863.16] + - [460, 9863.16] - - [4096, 1024, 1, 3141] - - [320, 9849.91] + - [454, 9849.91] - - [4096, 1024, 1, 3532] - - [320, 9866.3] + - [454, 9866.3] - - [64, 78, 816, 78] - - [297, 5316.88] + - [431, 5316.88] - - [1024, 4096, 1, 3169] - - [327, 9910.45] + - [461, 9910.45] - - [1024, 4096, 1, 3514] - - [326, 9918.0] + - [460, 9918.0] - - [4096, 1024, 1, 3780] - - [327, 9899.75] + - [461, 9899.75] - - [1024, 4096, 1, 3098] - - [319, 9901.62] + - [453, 9901.62] - - [1024, 4096, 1, 3449] - - [327, 9919.85] + - [461, 9919.85] - - [1024, 4096, 1, 3222] - - [319, 9917.66] + - [453, 9917.66] - - [1024, 4096, 1, 3346] - - [320, 9912.91] + - [454, 9912.91] - - [4096, 1024, 1, 3064] - - [327, 9848.79] + - [461, 9848.79] - - [4096, 1024, 1, 3511] - - [326, 9873.39] + - [460, 9873.39] - - [4096, 1024, 1, 3384] - - [326, 9870.98] + - [460, 9870.98] - - [4096, 1024, 1, 3356] - - [320, 9853.45] + - [454, 9853.45] - - [1024, 4096, 1, 3796] - - [319, 9940.66] + - [453, 9940.66] - - [4096, 1024, 1, 3427] - - [326, 9883.14] + - [460, 9883.14] - - [4096, 1024, 1, 3390] - - [326, 9863.79] + - [460, 9863.79] - - [4096, 1024, 1, 3573] - - [327, 9886.02] + - [461, 9886.02] - - [4096, 1024, 1, 3456] - - [320, 9890.61] + - [454, 9890.61] - - [1024, 4096, 1, 3360] - - [327, 9938.1] + - [461, 9938.1] - - [1024, 33708, 1, 3977] - - [320, 10327.2] + - [454, 10327.2] - - [1024, 4096, 1, 2918] - - [319, 9902.84] + - [453, 9902.84] - - [4096, 1024, 1, 3975] - - [326, 9905.27] + - [460, 9905.27] - - [4096, 1024, 1, 3525] - - [327, 9879.91] + - [461, 9879.91] - - [4096, 1024, 1, 3398] - - [319, 9873.91] + - [453, 9873.91] - - [4096, 1024, 1, 3640] - - [326, 9885.16] + - [460, 9885.16] - - [1024, 1024, 1, 3999] - - [318, 8995.42] + - [452, 8995.42] - - [4096, 1024, 1, 3014] - - [326, 9841.32] + - [460, 9841.32] - - [1024, 4096, 1, 3446] - - [319, 9917.21] + - [453, 9917.21] - - [1024, 33708, 1, 3796] - - [319, 10339.0] + - [453, 10339.0] - - [4096, 1024, 1, 3101] - - [319, 9827.34] + - [453, 9827.34] - - [4096, 1024, 1, 3563] - - [327, 9863.03] + - [461, 9863.03] - - [4096, 1024, 1, 3539] - - [319, 9889.54] + - [453, 9889.54] - - [4096, 1024, 1, 3182] - - [326, 9833.79] + - [460, 9833.79] - - [1024, 4096, 1, 3468] - - [320, 9913.05] + - [454, 9913.05] - - [4096, 1024, 1, 3312] - - [326, 9889.85] + - [460, 9889.85] - - [4096, 1024, 1, 3215] - - [326, 9853.88] + - [460, 9853.88] - - [4096, 1024, 1, 3910] - - [326, 9894.72] + - [460, 9894.72] - - [1024, 33708, 1, 3780] - - [320, 10332.0] + - [454, 10332.0] - - [1024, 4096, 1, 3290] - - [326, 9915.08] + - [460, 9915.08] - - [1024, 4096, 1, 4012] - - [326, 9942.65] + - [460, 9942.65] - - [1024, 4096, 1, 3385] - - [326, 9915.83] + - [460, 9915.83] - - [1024, 33708, 1, 3975] - - [319, 10330.1] + - [453, 10330.1] - - [4096, 1024, 1, 3996] - - [326, 9891.31] + - [460, 9891.31] - - [4096, 1024, 1, 2765] - - [327, 9800.38] + - [461, 9800.38] - - [4096, 1024, 1, 3538] - - [327, 9886.22] + - [461, 9886.22] - - [4096, 1024, 1, 3415] - - [327, 9874.6] + - [461, 9874.6] - - [1024, 4096, 1, 3554] - - [326, 9931.99] + - [460, 9931.99] - - [4096, 1024, 1, 3513] - - [320, 9874.25] + - [454, 9874.25] - - [1024, 4096, 1, 3304] - - [320, 9907.73] + - [454, 9907.73] - - [4096, 1024, 1, 3294] - - [326, 9851.25] + - [460, 9851.25] - - [4096, 1024, 1, 3396] - - [327, 9880.7] + - [461, 9880.7] - - [1024, 4096, 1, 3213] - - [320, 9891.12] + - [454, 9891.12] - - [4096, 1024, 1, 3137] - - [320, 9857.41] + - [454, 9857.41] - - [4096, 1024, 1, 3552] - - [326, 9904.22] + - [460, 9904.22] - - [1024, 1024, 1, 4020] - - [325, 9098.87] + - [459, 9098.87] - - [64, 13, 4672, 13] - - [307, 1693.54] + - [441, 1693.54] - - [1024, 4096, 1, 3461] - - [326, 9918.45] + - [460, 9918.45] - - [4096, 1024, 1, 3263] - - [319, 9843.89] + - [453, 9843.89] - - [4096, 1024, 1, 3430] - - [326, 9885.26] + - [460, 9885.26] - - [4096, 1024, 1, 3389] - - [326, 9859.23] + - [460, 9859.23] - - [4096, 1024, 1, 3528] - - [326, 9873.01] + - [460, 9873.01] - - [1024, 4096, 1, 3463] - - [327, 9929.61] + - [461, 9929.61] - - [4096, 1024, 1, 3526] - - [327, 9876.9] + - [461, 9876.9] - - [4096, 1024, 1, 3154] - - [326, 9858.25] + - [460, 9858.25] - - [4096, 1024, 1, 3499] - - [327, 9862.92] + - [461, 9862.92] - - [1024, 1024, 1, 3939] - - [325, 9107.41] + - [459, 9107.41] - - [4096, 1024, 1, 3955] - - [327, 9906.28] + - [461, 9906.28] - - [1024, 4096, 1, 3297] - - [320, 9925.34] + - [454, 9925.34] - - [1024, 4096, 1, 3233] - - [326, 9920.65] + - [460, 9920.65] - - [1024, 4096, 1, 3226] - - [326, 9911.35] + - [460, 9911.35] - - [4096, 1024, 1, 3404] - - [326, 9867.28] + - [460, 9867.28] - - [4096, 1024, 1, 3355] - - [326, 9862.66] + - [460, 9862.66] - - [1024, 4096, 1, 3542] - - [326, 9926.49] + - [460, 9926.49] - - [4096, 1024, 1, 3181] - - [327, 9831.86] + - [461, 9831.86] - - [1024, 4096, 1, 3474] - - [326, 9928.03] + - [460, 9928.03] - - [4096, 1024, 1, 3319] - - [326, 9870.28] + - [460, 9870.28] - - [1024, 4096, 1, 3434] - - [319, 9917.51] + - [453, 9917.51] - - [1024, 4096, 1, 3860] - - [326, 9945.32] + - [460, 9945.32] - - [1024, 4096, 1, 3343] - - [319, 9914.66] + - [453, 9914.66] - - [64, 77, 816, 78] - - [297, 5276.97] + - [431, 5276.97] - - [1024, 4096, 1, 3488] - - [326, 9945.81] + - [460, 9945.81] - - [1024, 4096, 1, 3046] - - [326, 9908.78] + - [460, 9908.78] - - [1024, 4096, 1, 3141] - - [327, 9909.18] + - [461, 9909.18] - - [1024, 4096, 1, 3516] - - [327, 9911.38] + - [461, 9911.38] - - [4096, 1024, 1, 3147] - - [326, 9840.47] + - [460, 9840.47] - - [1024, 1024, 1, 4059] - - [318, 9009.78] + - [452, 9009.78] - - [1024, 1024, 1, 3944] - - [318, 9006.17] + - [452, 9006.17] - - [1024, 4096, 1, 3421] - - [327, 9919.86] + - [461, 9919.86] - - [4096, 1024, 1, 3944] - - [320, 9899.53] + - [454, 9899.53] - - [64, 45, 1424, 45] - - [305, 4068.67] + - [439, 4068.67] - - [1024, 4096, 1, 3574] - - [320, 9930.19] + - [454, 9930.19] - - [1024, 4096, 1, 3977] - - [319, 9944.28] + - [453, 9944.28] - - [1024, 1024, 1, 3968] - - [324, 9045.22] + - [458, 9045.22] - - [1024, 4096, 1, 2985] - - [326, 9887.65] + - [460, 9887.65] - - [64, 193, 320, 193] - - [313, 6631.35] + - [447, 6631.35] - - [1024, 4096, 1, 3427] - - [327, 9933.41] + - [461, 9933.41] - - [64, 12, 5040, 12] - - [307, 1552.53] + - [441, 1552.53] - - [1024, 4096, 1, 3482] - - [327, 9942.22] + - [461, 9942.22] - - [1024, 4096, 1, 3332] - - [319, 9923.58] + - [453, 9923.58] - - [1024, 1024, 1, 3720] - - [323, 9039.56] + - [457, 9039.56] - - [4096, 1024, 1, 3308] - - [327, 9852.66] + - [461, 9852.66] - - [1024, 4096, 1, 3513] - - [327, 9919.99] + - [461, 9919.99] - - [1024, 4096, 1, 3154] - - [320, 9908.46] + - [454, 9908.46] - - [1024, 4096, 1, 3955] - - [327, 9950.01] + - [461, 9950.01] - - [1024, 4096, 1, 2967] - - [327, 9897.44] + - [461, 9897.44] - - [1024, 33708, 1, 3942] - - [319, 10336.1] + - [453, 10336.1] - - [1024, 4096, 1, 3319] - - [327, 9912.45] + - [461, 9912.45] - - [4096, 1024, 1, 3860] - - [326, 9909.29] + - [460, 9909.29] - - [1024, 4096, 1, 3548] - - [319, 9924.21] + - [453, 9924.21] - - [4096, 1024, 1, 3977] - - [327, 9891.44] + - [461, 9891.44] - - [4096, 1024, 1, 3535] - - [326, 9867.84] + - [460, 9867.84] - - [1024, 4096, 1, 3541] - - [327, 9923.16] + - [461, 9923.16] - - [1024, 1024, 1, 3910] - - [324, 9080.4] + - [458, 9080.4] - - [1024, 33708, 1, 3584] - - [319, 10333.0] + - [453, 10333.0] - - [1024, 4096, 1, 3168] - - [320, 9926.27] + - [454, 9926.27] - - [1024, 4096, 1, 3448] - - [327, 9922.42] + - [461, 9922.42] - - [4096, 1024, 1, 3343] - - [326, 9857.23] + - [460, 9857.23] - - [64, 35, 1808, 35] - - [309, 3175.44] + - [443, 3175.44] - - [1024, 4096, 1, 3357] - - [320, 9902.41] + - [454, 9902.41] - - [64, 143, 432, 143] - - [310, 6489.7] + - [444, 6489.7] - - [4096, 1024, 1, 3510] - - [326, 9867.4] + - [460, 9867.4] - - [4096, 1024, 1, 3369] - - [326, 9863.44] + - [460, 9863.44] - - [64, 92, 688, 93] - - [297, 6188.3] + - [431, 6188.3] - - [4096, 1024, 1, 3379] - - [326, 9870.12] + - [460, 9870.12] - - [1024, 4096, 1, 3276] - - [326, 9904.77] + - [460, 9904.77] - - [1024, 4096, 1, 3363] - - [326, 9925.13] + - [460, 9925.13] - - [4096, 1024, 1, 3055] - - [326, 9831.92] + - [460, 9831.92] - - [1024, 4096, 1, 3524] - - [319, 9923.79] + - [453, 9923.79] - - [4096, 1024, 1, 3057] - - [326, 9852.87] + - [460, 9852.87] - - [1024, 33708, 1, 3720] - - [320, 10327.1] + - [454, 10327.1] - - [1024, 4096, 1, 3383] - - [319, 9919.39] + - [453, 9919.39] - - [1024, 4096, 1, 3522] - - [320, 9932.56] + - [454, 9932.56] - - [1024, 33708, 1, 3956] - - [319, 10333.8] + - [453, 10333.8] - - [1024, 4096, 1, 3481] - - [319, 9922.08] + - [453, 9922.08] - - [4096, 1024, 1, 3562] - - [327, 9874.86] + - [461, 9874.86] - - [4096, 1024, 1, 3299] - - [326, 9872.97] + - [460, 9872.97] - - [1024, 4096, 1, 3262] - - [320, 9924.83] + - [454, 9924.83] - - [1024, 4096, 1, 3840] - - [319, 9961.84] + - [453, 9961.84] - - [1024, 33708, 1, 4026] - - [319, 10334.3] + - [453, 10334.3] - - [4096, 1024, 1, 3168] - - [320, 9878.45] + - [454, 9878.45] - - [64, 101, 624, 101] - - [300, 5734.72] + - [434, 5734.72] - - [1024, 4096, 1, 3999] - - [319, 9947.1] + - [453, 9947.1] - - [1024, 4096, 1, 3549] - - [319, 9923.3] + - [453, 9923.3] - - [4096, 1024, 1, 3375] - - [326, 9868.89] + - [460, 9868.89] - - [1024, 4096, 1, 3496] - - [327, 9928.67] + - [461, 9928.67] - - [64, 29, 2176, 29] - - [296, 3290.02] + - [430, 3290.02] - - [1024, 4096, 1, 3190] - - [327, 9897.61] + - [461, 9897.61] - - [4096, 1024, 1, 3273] - - [327, 9853.65] + - [461, 9853.65] - - [1024, 4096, 1, 3406] - - [326, 9907.04] + - [460, 9907.04] - - [4096, 1024, 1, 4005] - - [319, 9907.97] + - [453, 9907.97] - - [4096, 1024, 1, 3555] - - [326, 9878.96] + - [460, 9878.96] - - [4096, 1024, 1, 2505] - - [326, 9785.1] + - [460, 9785.1] - - [1024, 4096, 1, 3460] - - [326, 9930.24] + - [460, 9930.24] - - [64, 17, 3632, 17] - - [297, 1917.27] + - [431, 1917.27] - - [1024, 4096, 1, 3579] - - [320, 9920.94] + - [454, 9920.94] - - [1024, 33708, 1, 4030] - - [320, 10327.7] + - [454, 10327.7] - - [1024, 4096, 1, 3510] - - [320, 9931.31] + - [454, 9931.31] - - [1024, 1024, 1, 3969] - - [317, 9020.83] + - [451, 9020.83] - - [1024, 4096, 1, 3282] - - [327, 9920.05] + - [461, 9920.05] - - [1024, 4096, 1, 3377] - - [319, 9927.34] + - [453, 9927.34] - - [1024, 4096, 1, 2935] - - [327, 9903.48] + - [461, 9903.48] - - [64, 41, 1552, 41] - - [297, 3740.48] + - [431, 3740.48] - - [1024, 4096, 1, 3498] - - [319, 9915.01] + - [453, 9915.01] - - [1024, 4096, 1, 3593] - - [326, 9925.64] + - [460, 9925.64] - - [1024, 1024, 1, 3948] - - [325, 9009.03] + - [459, 9009.03] - - [4096, 1024, 1, 3226] - - [327, 9854.75] + - [461, 9854.75] - - [1024, 4096, 1, 2499] - - [326, 9904.82] + - [460, 9904.82] - - [1024, 4096, 1, 3296] - - [319, 9926.89] + - [453, 9926.89] - - [1024, 4096, 1, 3455] - - [326, 9917.52] + - [460, 9917.52] - - [1024, 4096, 1, 3399] - - [320, 9919.7] + - [454, 9919.7] - - [1024, 4096, 1, 3205] - - [319, 9917.74] + - [453, 9917.74] - - [4096, 1024, 1, 4026] - - [327, 9897.81] + - [461, 9897.81] - - [1024, 4096, 1, 3484] - - [319, 9915.53] + - [453, 9915.53] - - [4096, 1024, 1, 3302] - - [327, 9862.8] + - [461, 9862.8] - - [1024, 4096, 1, 3485] - - [327, 9913.0] + - [461, 9913.0] - - [1024, 1024, 1, 3996] - - [325, 9008.77] + - [459, 9008.77] - - [1024, 4096, 1, 3126] - - [320, 9910.16] + - [454, 9910.16] - - [1024, 4096, 1, 4050] - - [319, 9951.21] + - [453, 9951.21] - - [4096, 1024, 1, 3235] - - [320, 9870.74] + - [454, 9870.74] - - [1024, 33708, 1, 3955] - - [319, 10336.1] + - [453, 10336.1] - - [1024, 4096, 1, 3342] - - [319, 9903.85] + - [453, 9903.85] - - [1024, 1024, 1, 3900] - - [324, 9082.92] + - [458, 9082.92] - - [1024, 4096, 1, 3397] - - [327, 9922.7] + - [461, 9922.7] - - [4096, 1024, 1, 3491] - - [327, 9880.75] + - [461, 9880.75] - - [1024, 4096, 1, 3503] - - [319, 9923.28] + - [453, 9923.28] - - [1024, 4096, 1, 3140] - - [320, 9908.41] + - [454, 9908.41] - - [4096, 1024, 1, 3121] - - [326, 9860.32] + - [460, 9860.32] - - [4096, 1024, 1, 3276] - - [326, 9854.19] + - [460, 9854.19] - - [1024, 4096, 1, 3321] - - [327, 9917.86] + - [461, 9917.86] - - [1024, 4096, 1, 3870] - - [327, 9931.07] + - [461, 9931.07] - - [4096, 1024, 1, 3475] - - [326, 9877.58] + - [460, 9877.58] - - [1024, 4096, 1, 2984] - - [326, 9895.59] + - [460, 9895.59] - - [4096, 1024, 1, 3363] - - [320, 9873.44] + - [454, 9873.44] - - [1024, 4096, 1, 3582] - - [326, 9920.87] + - [460, 9920.87] - - [4096, 1024, 1, 3509] - - [326, 9886.86] + - [460, 9886.86] - - [1024, 4096, 1, 3426] - - [319, 9928.86] + - [453, 9928.86] - - [4096, 1024, 1, 3136] - - [326, 9872.61] + - [460, 9872.61] - - [1024, 4096, 1, 3232] - - [327, 9926.29] + - [461, 9926.29] - - [4096, 1024, 1, 3103] - - [326, 9839.03] + - [460, 9839.03] - - [1024, 4096, 1, 3335] - - [320, 9913.37] + - [454, 9913.37] - - [1024, 4096, 1, 3900] - - [319, 9938.01] + - [453, 9938.01] - - [4096, 1024, 1, 3512] - - [320, 9877.26] + - [454, 9877.26] - - [4096, 1024, 1, 3222] - - [326, 9859.77] + - [460, 9859.77] - - [1024, 4096, 1, 3165] - - [326, 9899.71] + - [460, 9899.71] - - [4096, 1024, 1, 3408] - - [326, 9899.68] + - [460, 9899.68] - - [4096, 1024, 1, 3751] - - [326, 9891.49] + - [460, 9891.49] - - [1024, 4096, 1, 3318] - - [319, 9913.42] + - [453, 9913.42] - - [4096, 1024, 1, 3442] - - [327, 9880.21] + - [461, 9880.21] - - [1024, 4096, 1, 3413] - - [326, 9921.9] + - [460, 9921.9] - - [4096, 1024, 1, 3524] - - [326, 9879.22] + - [460, 9879.22] - - [1024, 4096, 1, 3976] - - [327, 9945.57] + - [461, 9945.57] - - [1024, 4096, 1, 3475] - - [327, 9932.51] + - [461, 9932.51] - - [1024, 4096, 1, 3534] - - [319, 9911.49] + - [453, 9911.49] - - [4096, 1024, 1, 3301] - - [326, 9872.75] + - [460, 9872.75] - - [4096, 1024, 1, 3248] - - [326, 9878.22] + - [460, 9878.22] - - [1024, 4096, 1, 2977] - - [320, 9899.93] + - [454, 9899.93] - - [4096, 1024, 1, 3346] - - [326, 9876.07] + - [460, 9876.07] - - [1024, 4096, 1, 3451] - - [319, 9920.16] + - [453, 9920.16] - - [1024, 4096, 1, 3257] - - [320, 9905.02] + - [454, 9905.02] - - [1024, 1024, 1, 3640] - - [318, 8983.39] + - [452, 8983.39] - - [1024, 4096, 1, 3356] - - [319, 9904.48] + - [453, 9904.48] - - [4096, 1024, 1, 3348] - - [327, 9872.53] + - [461, 9872.53] - - [4096, 1024, 1, 3335] - - [326, 9865.82] + - [460, 9865.82] - - [4096, 1024, 1, 3505] - - [326, 9888.88] + - [460, 9888.88] - - [1024, 4096, 1, 3490] - - [319, 9938.0] + - [453, 9938.0] - - [4096, 1024, 1, 3447] - - [326, 9865.39] + - [460, 9865.39] - - [1024, 4096, 1, 3267] - - [327, 9919.32] + - [461, 9919.32] - - [4096, 1024, 1, 3230] - - [326, 9853.2] + - [460, 9853.2] - - [4096, 1024, 1, 3455] - - [326, 9862.44] + - [460, 9862.44] - - [1024, 4096, 1, 3925] - - [319, 9945.64] + - [453, 9945.64] - - [1024, 4096, 1, 3362] - - [320, 9921.63] + - [454, 9921.63] - - [4096, 1024, 1, 3969] - - [327, 9911.98] + - [461, 9911.98] - - [4096, 1024, 1, 3527] - - [326, 9882.87] + - [460, 9882.87] - - [1024, 4096, 1, 3585] - - [320, 9946.52] + - [454, 9946.52] - - [4096, 1024, 1, 3063] - - [326, 9854.03] + - [460, 9854.03] - - [4096, 1024, 1, 3435] - - [326, 9867.13] + - [460, 9867.13] - - [4096, 1024, 1, 3366] - - [327, 9864.02] + - [461, 9864.02] - - [4096, 1024, 1, 3581] - - [319, 9868.57] + - [453, 9868.57] - - [1024, 33708, 1, 3906] - - [319, 10339.3] + - [453, 10339.3] - - [1024, 4096, 1, 3464] - - [327, 9916.21] + - [461, 9916.21] - - [1024, 4096, 1, 3440] - - [326, 9945.25] + - [460, 9945.25] - - [4096, 1024, 1, 3143] - - [326, 9846.76] + - [460, 9846.76] - - [1024, 4096, 1, 3349] - - [320, 9912.83] + - [454, 9912.83] - - [4096, 1024, 1, 3416] - - [326, 9885.13] + - [460, 9885.13] - - [4096, 1024, 1, 3365] - - [326, 9876.0] + - [460, 9876.0] - - [1024, 4096, 1, 3470] - - [327, 9914.98] + - [461, 9914.98] - - [4096, 1024, 1, 3287] - - [326, 9860.69] + - [460, 9860.69] - - [1024, 4096, 1, 3441] - - [327, 9928.98] + - [461, 9928.98] - - [4096, 1024, 1, 3224] - - [326, 9857.83] + - [460, 9857.83] - - [1024, 4096, 1, 3387] - - [319, 9911.72] + - [453, 9911.72] - - [1024, 4096, 1, 3547] - - [319, 9920.36] + - [453, 9920.36] - - [4096, 1024, 1, 3478] - - [320, 9882.9] + - [454, 9882.9] - - [4096, 1024, 1, 3548] - - [327, 9869.45] + - [461, 9869.45] - - [1024, 33708, 1, 4020] - - [319, 10345.3] + - [453, 10345.3] - - [4096, 1024, 1, 3320] - - [326, 9863.74] + - [460, 9863.74] - - [1024, 4096, 1, 3906] - - [326, 9942.67] + - [460, 9942.67] - - [4096, 1024, 1, 3796] - - [326, 9899.13] + - [460, 9899.13] - - [1024, 4096, 1, 3306] - - [319, 9902.4] + - [453, 9902.4] - - [1024, 4096, 1, 3401] - - [327, 9913.95] + - [461, 9913.95] - - [64, 147, 432, 147] - - [310, 6626.6] + - [444, 6626.6] - - [1024, 4096, 1, 3215] - - [327, 9911.24] + - [461, 9911.24] - - [4096, 1024, 1, 4012] - - [327, 9898.2] + - [461, 9898.2] - - [1024, 4096, 1, 2765] - - [327, 9863.73] + - [461, 9863.73] - - [4096, 1024, 1, 3554] - - [320, 9883.52] + - [454, 9883.52] - - [4096, 1024, 1, 3423] - - [326, 9866.72] + - [460, 9866.72] - - [1024, 1024, 1, 3751] - - [324, 9006.36] + - [458, 9006.36] - - [1024, 4096, 1, 3562] - - [320, 9922.08] + - [454, 9922.08] - - [1024, 4096, 1, 3489] - - [319, 9936.78] + - [453, 9936.78] - - [4096, 1024, 1, 3358] - - [326, 9858.22] + - [460, 9858.22] - - [4096, 1024, 1, 3270] - - [327, 9850.84] + - [461, 9850.84] - - [1024, 4096, 1, 3293] - - [319, 9905.33] + - [453, 9905.33] - - [1024, 4096, 1, 3376] - - [319, 9934.98] + - [453, 9934.98] - - [4096, 1024, 1, 3245] - - [326, 9852.52] + - [460, 9852.52] - - [4096, 1024, 1, 3541] - - [326, 9887.22] + - [460, 9887.22] - - [4096, 1024, 1, 3443] - - [326, 9871.73] + - [460, 9871.73] - - [4096, 1024, 1, 3438] - - [327, 9863.86] + - [461, 9863.86] - - [4096, 1024, 1, 3244] - - [326, 9859.76] + - [460, 9859.76] - - [1024, 4096, 1, 3365] - - [326, 9922.1] + - [460, 9922.1] - - [1024, 4096, 1, 3299] - - [320, 9923.38] + - [454, 9923.38] - - [4096, 1024, 1, 3840] - - [326, 9914.75] + - [460, 9914.75] - - [1024, 4096, 1, 3471] - - [327, 9918.38] + - [461, 9918.38] - - [1024, 4096, 1, 3398] - - [319, 9918.99] + - [453, 9918.99] - - [4096, 1024, 1, 3162] - - [326, 9843.93] + - [460, 9843.93] - - [1024, 4096, 1, 4005] - - [320, 9947.87] + - [454, 9947.87] - - [4096, 1024, 1, 3579] - - [326, 9868.25] + - [460, 9868.25] - - [64, 18, 3440, 18] - - [302, 2059.33] + - [436, 2059.33] - - [64, 177, 352, 177] - - [321, 7315.4] + - [455, 7315.4] - - [1024, 4096, 1, 3121] - - [327, 9930.34] + - [461, 9930.34] - - [4096, 1024, 1, 3441] - - [326, 9883.28] + - [460, 9883.28] - - [4096, 1024, 1, 3422] - - [326, 9858.41] + - [460, 9858.41] - - [4096, 1024, 1, 3444] - - [326, 9887.03] + - [460, 9887.03] - - [1024, 4096, 1, 3337] - - [320, 9911.45] + - [454, 9911.45] - - [4096, 1024, 1, 3550] - - [319, 9871.87] + - [453, 9871.87] - - [1024, 4096, 1, 3477] - - [319, 9930.65] + - [453, 9930.65] - - [4096, 1024, 1, 3490] - - [326, 9878.45] + - [460, 9878.45] - - [4096, 1024, 1, 3585] - - [326, 9893.63] + - [460, 9893.63] - - [1024, 4096, 1, 3143] - - [319, 9901.19] + - [453, 9901.19] - - [1024, 33708, 1, 3876] - - [320, 10330.8] + - [454, 10330.8] - - [1024, 4096, 1, 3320] - - [327, 9913.18] + - [461, 9913.18] - - [1024, 4096, 1, 3423] - - [327, 9914.14] + - [461, 9914.14] - - [1024, 4096, 1, 3894] - - [319, 9944.47] + - [453, 9944.47] - - [4096, 1024, 1, 3410] - - [326, 9878.67] + - [460, 9878.67] - - [1024, 4096, 1, 3561] - - [319, 9926.68] + - [453, 9926.68] - - [4096, 1024, 1, 3492] - - [320, 9872.92] + - [454, 9872.92] - - [64, 85, 752, 85] - - [297, 5734.35] + - [431, 5734.35] - - [36548, 1024, 1, 3712] - - [329, 10367.6] + - [463, 10367.6] - - [4096, 2048, 1, 128] - - [330, 8743.93] + - [464, 8743.93] - - [1024, 1024, 1, 3712] - - [331, 9976.29] + - [465, 9976.29] - - [1024, 1024, 1, 128] - - [328, 5765.47] + - [462, 5765.47] - - [4096, 3072, 1, 128] - - [330, 8869.11] + - [464, 8869.11] + - - [768, 3072, 1, 4096] + - [476, 10028.8] + - - [64, 256, 192, 256] + - [470, 8791.65] + - - [768, 2, 1, 16] + - [473, 5.05484] + - - [768, 768, 1, 64] + - [469, 3469.65] + - - [768, 768, 1, 4096] + - [477, 7475.1] + - - [768, 30522, 1, 1280] + - [480, 10297.0] + - - [64, 128, 384, 128] + - [470, 7660.93] + - - [768, 30522, 1, 320] + - [478, 10008.0] + - - [768, 768, 1, 32] + - [467, 2359.4] + - - [3072, 768, 1, 4096] + - [476, 10033.8] + - - [768, 30522, 1, 640] + - [479, 10206.8] + - - [64, 64, 768, 64] + - [468, 5494.82] + - - [768, 768, 1, 640] + - [477, 6721.74] + - - [768, 768, 1, 16] + - [466, 1203.82] + - - [768, 768, 1, 1280] + - [475, 7138.67] + - - [768, 2, 1, 32] + - [471, 11.9154] + - - [2048, 2048, 1, 512] + - [491, 9607.67] + - - [512, 32, 1, 200] + - [484, 422.368] + - - [1024, 1, 1, 200] + - [487, 24.7154] + - - [1600, 1024, 1, 512] + - [482, 8116.01] + - - [560, 1024, 1, 200] + - [481, 4810.84] + - - [1024, 1024, 1, 512] + - [490, 8614.84] + - - [2048, 1, 1, 512] + - [485, 81.0086] + - - [512, 512, 1, 200] + - [483, 4398.49] + - - [100, 2048, 1, 512] + - [488, 4443.22] + - - [1024, 1024, 1, 200] + - [489, 6990.61] + - - [1024, 64, 1, 512] + - [486, 2853.37] + - - [1024, 256, 1, 18944] + - [510, 9196.51] + - - [256, 3328, 1, 8976] + - [500, 8299.36] + - - [1024, 256, 1, 4352] + - [508, 8813.84] + - - [256, 9728, 1, 8976] + - [503, 9638.58] + - - [1024, 256, 1, 3072] + - [510, 8640.73] + - - [768, 2048, 1, 256] + - [502, 8663.03] + - - [1024, 256, 1, 19968] + - [507, 9220.96] + - - [256, 12800, 1, 8976] + - [497, 9418.52] + - - [1024, 256, 1, 3328] + - [511, 8682.58] + - - [256, 10240, 1, 8976] + - [504, 10137.8] + - - [1024, 256, 1, 15104] + - [509, 9167.13] + - - [256, 10496, 1, 8976] + - [497, 9858.48] + - - [1024, 256, 1, 2816] + - [512, 8575.81] + - - [1024, 256, 1, 4608] + - [507, 8861.31] + - - [256, 11264, 1, 8976] + - [494, 9627.79] + - - [1024, 256, 1, 6400] + - [507, 8985.33] + - - [1024, 256, 1, 16128] + - [507, 9170.36] + - - [256, 44505, 1, 8976] + - [501, 10331.9] + - - [256, 6144, 1, 8976] + - [504, 10395.1] + - - [1024, 256, 1, 5120] + - [509, 8881.63] + - - [1024, 256, 1, 7936] + - [512, 9023.24] + - - [256, 3840, 1, 8976] + - [499, 9541.38] + - - [1024, 256, 1, 21248] + - [507, 9209.82] + - - [1024, 256, 1, 12032] + - [509, 9156.27] + - - [256, 8192, 1, 8976] + - [506, 10374.5] + - - [1024, 256, 1, 3584] + - [508, 8712.3] + - - [1024, 256, 1, 14336] + - [509, 9162.61] + - - [256, 7168, 1, 8976] + - [495, 9554.96] + - - [1024, 256, 1, 13568] + - [507, 9165.14] + - - [256, 4096, 1, 8976] + - [499, 10146.7] + - - [1024, 256, 1, 4096] + - [508, 8783.98] + - - [256, 2560, 1, 8976] + - [498, 8381.66] + - - [256, 20992, 1, 8976] + - [497, 9989.96] + - - [256, 4352, 1, 8976] + - [498, 9635.02] + - - [256, 33536, 1, 8976] + - [497, 10218.2] + - - [256, 3584, 1, 8976] + - [499, 8924.6] + - - [256, 26112, 1, 8976] + - [498, 10272.4] + - - [256, 14336, 1, 8976] + - [502, 10217.4] + - - [1024, 256, 1, 14848] + - [509, 9185.29] + - - [1024, 256, 1, 8448] + - [510, 9025.99] + - - [1024, 256, 1, 28672] + - [507, 9256.5] + - - [1024, 256, 1, 5632] + - [507, 8932.79] + - - [256, 22016, 1, 8976] + - [502, 10152.0] + - - [1024, 256, 1, 33536] + - [507, 9243.17] + - - [256, 5120, 1, 8976] + - [493, 9418.15] + - - [256, 11520, 1, 8976] + - [500, 9701.1] + - - [256, 19968, 1, 8976] + - [498, 10228.1] + - - [1024, 256, 1, 5376] + - [509, 8892.62] + - - [1024, 256, 1, 22016] + - [507, 9244.34] + - - [256, 8960, 1, 8976] + - [498, 9841.41] + - - [1024, 256, 1, 15872] + - [507, 9223.25] + - - [256, 17408, 1, 8976] + - [502, 9785.87] + - - [256, 5632, 1, 8976] + - [502, 9564.32] + - - [256, 32512, 1, 8976] + - [501, 10358.0] + - - [256, 11008, 1, 8976] + - [494, 9445.23] + - - [1024, 256, 1, 6144] + - [509, 8955.91] + - - [256, 4864, 1, 8976] + - [494, 8979.45] + - - [256, 15104, 1, 8976] + - [497, 10007.1] + - - [1024, 256, 1, 9984] + - [507, 9110.53] + - - [256, 1280, 1, 8976] + - [493, 5944.44] + - - [1024, 256, 1, 1024] + - [509, 7005.2] + - - [1024, 256, 1, 9728] + - [509, 9066.29] + - - [1024, 256, 1, 10496] + - [507, 9118.15] + - - [256, 11776, 1, 8976] + - [504, 9911.74] + - - [256, 12544, 1, 8976] + - [497, 9235.35] + - - [1024, 256, 1, 17152] + - [507, 9152.31] + - - [1024, 256, 1, 11520] + - [509, 9146.87] + - - [1024, 256, 1, 21504] + - [509, 9207.52] + - - [256, 17152, 1, 8976] + - [496, 9654.81] + - - [1024, 256, 1, 17408] + - [507, 9181.27] + - - [256, 15872, 1, 8976] + - [505, 10086.5] + - - [256, 18688, 1, 8976] + - [498, 9612.57] + - - [256, 5888, 1, 8976] + - [502, 9988.43] + - - [512, 2048, 1, 256] + - [492, 7678.46] + - - [1024, 256, 1, 7680] + - [510, 9033.06] + - - [1024, 256, 1, 1280] + - [512, 7767.33] + - - [256, 14848, 1, 8976] + - [498, 9852.76] + - - [256, 9984, 1, 8976] + - [504, 9908.97] + - - [256, 20480, 1, 8976] + - [502, 10337.2] + - - [1024, 256, 1, 8192] + - [509, 9044.42] + - - [1024, 256, 1, 19712] + - [508, 9184.28] + - - [256, 13568, 1, 8976] + - [498, 9927.92] + - - [256, 13312, 1, 8976] + - [497, 9758.01] + - - [256, 2816, 1, 8976] + - [497, 9191.53] + - - [1024, 256, 1, 2304] + - [508, 8445.01] + - - [256, 21248, 1, 8976] + - [498, 10127.6] + - - [256, 16128, 1, 8976] + - [506, 10238.5] + - - [256, 512, 36, 98] + - [529, 7994.95] + - - [64, 192, 36, 25088] + - [598, 8613.99] + - - [128, 128, 64, 25] + - [528, 2540.25] + - - [256, 256, 64, 56] + - [529, 6924.66] + - - [512, 486, 36, 800] + - [536, 8994.94] + - - [512, 512, 36, 1568] + - [547, 9872.48] + - - [64, 192, 64, 3200] + - [592, 9295.99] + - - [256, 384, 36, 4096] + - [592, 9334.71] + - - [128, 256, 64, 32] + - [531, 4280.0] + - - [64, 128, 64, 23104] + - [598, 10103.2] + - - [128, 256, 64, 9] + - [522, 1709.73] + - - [256, 512, 36, 784] + - [532, 9520.83] + - - [256, 324, 36, 32] + - [570, 4473.48] + - - [512, 512, 36, 33] + - [541, 5925.27] + - - [16, 32, 36, 5760] + - [545, 1448.9] + - - [192, 384, 64, 128] + - [592, 8618.53] + - - [512, 512, 64, 72] + - [548, 8260.22] + - - [128, 128, 64, 1600] + - [521, 9008.48] + - - [512, 512, 36, 128] + - [592, 8871.72] + - - [192, 384, 64, 2304] + - [521, 9657.26] + - - [384, 256, 64, 450] + - [557, 9539.03] + - - [3, 64, 36, 6272] + - [545, 509.884] + - - [3, 64, 64, 2888] + - [574, 708.721] + - - [384, 256, 64, 2304] + - [557, 10287.6] + - - [512, 512, 64, 144] + - [592, 9226.8] + - - [256, 256, 36, 6272] + - [532, 9607.38] + - - [80, 192, 64, 4608] + - [593, 7348.03] + - - [64, 64, 36, 3136] + - [580, 5959.15] + - - [256, 384, 64, 2304] + - [557, 10283.5] + - - [512, 512, 36, 66] + - [541, 7618.18] + - - [128, 256, 64, 800] + - [567, 9611.25] + - - [64, 128, 36, 30] + - [523, 1242.71] + - - [192, 256, 36, 512] + - [592, 8658.07] + - - [256, 512, 64, 200] + - [592, 9153.97] + - - [256, 512, 64, 25] + - [570, 5349.98] + - - [3, 64, 64, 46208] + - [573, 808.662] + - - [128, 256, 36, 1568] + - [565, 8528.72] + - - [64, 128, 64, 11552] + - [598, 9997.1] + - - [128, 192, 64, 946] + - [592, 9198.48] + - - [64, 192, 64, 12800] + - [553, 9000.76] + - - [224, 224, 64, 128] + - [530, 6312.17] + - - [128, 256, 64, 288] + - [592, 8697.97] + - - [64, 64, 64, 826] + - [535, 6650.31] + - - [256, 384, 64, 1152] + - [567, 10106.9] + - - [3, 64, 64, 92416] + - [573, 812.131] + - - [32, 32, 36, 43808] + - [514, 2813.19] + - - [160, 320, 64, 288] + - [524, 8090.96] + - - [1, 16, 36, 23040] + - [561, 42.7667] + - - [128, 256, 36, 128] + - [539, 6049.58] + - - [128, 128, 64, 3360] + - [592, 9200.06] + - - [128, 128, 64, 420] + - [592, 8131.6] + - - [64, 128, 64, 361] + - [529, 6938.08] + - - [512, 512, 36, 16] + - [585, 3797.76] + - - [384, 256, 36, 800] + - [526, 9151.75] + - - [192, 384, 36, 4096] + - [526, 8867.67] + - - [64, 64, 64, 1600] + - [578, 7931.84] + - - [256, 384, 64, 576] + - [558, 9745.9] + - - [512, 512, 64, 14] + - [541, 3638.28] + - - [512, 512, 36, 8] + - [516, 2279.61] + - - [512, 486, 64, 128] + - [532, 8337.93] + - - [1, 16, 64, 640] + - [566, 50.0512] + - - [64, 96, 64, 288] + - [591, 5708.07] + - - [96, 96, 36, 1568] + - [560, 6866.85] + - - [256, 256, 36, 128] + - [564, 7703.92] + - - [64, 128, 36, 53824] + - [552, 6331.41] + - - [256, 256, 36, 32] + - [548, 4648.96] + - - [192, 256, 64, 288] + - [592, 8987.89] + - - [256, 256, 36, 16] + - [562, 2912.81] + - - [128, 256, 36, 3200] + - [565, 8680.37] + - - [160, 320, 64, 512] + - [524, 8449.54] + - - [128, 160, 36, 512] + - [535, 7215.07] + - - [96, 96, 36, 2592] + - [530, 7104.89] + - - [64, 96, 64, 800] + - [560, 7268.42] + - - [147, 64, 36, 18816] + - [576, 7116.36] + - - [160, 320, 36, 512] + - [530, 7874.92] + - - [256, 512, 36, 4] + - [569, 1034.88] + - - [96, 128, 64, 946] + - [552, 7901.17] + - - [256, 324, 64, 1568] + - [557, 8589.63] + - - [128, 128, 64, 50] + - [548, 4070.66] + - - [35, 96, 36, 8960] + - [542, 4207.4] + - - [32, 64, 36, 43808] + - [583, 4390.91] + - - [160, 224, 36, 128] + - [530, 5447.02] + - - [64, 64, 64, 81] + - [555, 2391.28] + - - [256, 256, 36, 3200] + - [521, 9559.65] + - - [256, 256, 36, 210] + - [532, 8414.71] + - - [192, 384, 64, 576] + - [592, 9468.85] + - - [512, 512, 64, 800] + - [567, 10096.5] + - - [512, 24, 36, 800] + - [518, 4761.87] + - - [64, 64, 64, 13216] + - [579, 8491.51] + - - [192, 224, 64, 1152] + - [535, 8769.16] + - - [256, 256, 64, 1152] + - [557, 9988.19] + - - [512, 486, 64, 512] + - [567, 9254.77] + - - [128, 128, 36, 784] + - [530, 7468.16] + - - [256, 512, 64, 1600] + - [554, 10232.6] + - - [512, 512, 64, 9] + - [548, 2599.88] + - - [96, 128, 64, 288] + - [560, 6599.53] + - - [64, 96, 36, 512] + - [560, 5073.85] + - - [256, 512, 36, 1568] + - [592, 9637.91] + - - [128, 128, 64, 400] + - [592, 8192.1] + - - [128, 128, 64, 800] + - [592, 8716.44] + - - [96, 128, 36, 512] + - [580, 6757.03] + - - [16, 32, 36, 360] + - [543, 754.136] + - - [128, 256, 64, 3200] + - [557, 10222.6] + - - [96, 128, 64, 800] + - [560, 7968.0] + - - [256, 512, 64, 4] + - [522, 1098.09] + - - [256, 256, 64, 450] + - [567, 9347.55] + - - [64, 64, 64, 3200] + - [578, 8518.18] + - - [192, 224, 64, 128] + - [538, 7035.27] + - - [128, 128, 64, 288] + - [592, 7751.38] + - - [256, 256, 64, 72] + - [548, 7489.93] + - - [96, 208, 36, 512] + - [560, 6939.21] + - - [128, 256, 36, 3136] + - [535, 8669.43] + - - [64, 64, 36, 3520] + - [530, 6007.57] + - - [64, 128, 36, 1568] + - [593, 6897.8] + - - [160, 320, 64, 242] + - [519, 7873.27] + - - [192, 192, 36, 512] + - [530, 7707.42] + - - [512, 512, 36, 512] + - [592, 9582.52] + - - [1, 16, 64, 10240] + - [544, 71.4511] + - - [128, 128, 36, 512] + - [530, 7149.48] + - - [512, 512, 36, 256] + - [521, 9384.5] + - - [512, 512, 36, 1024] + - [515, 9777.99] + - - [96, 208, 64, 1152] + - [593, 7851.0] + - - [128, 192, 64, 3200] + - [521, 9490.92] + - - [256, 256, 36, 4096] + - [526, 9585.56] + - - [160, 160, 64, 288] + - [560, 7299.9] + - - [256, 256, 64, 896] + - [557, 9850.43] + - - [128, 256, 64, 242] + - [592, 8391.48] + - - [128, 128, 36, 440] + - [535, 6274.82] + - - [96, 128, 36, 1568] + - [580, 7875.13] + - - [192, 384, 36, 1024] + - [526, 8715.82] + - - [64, 96, 36, 10368] + - [597, 7478.69] + - - [128, 256, 64, 100] + - [541, 7085.07] + - - [112, 224, 36, 2048] + - [534, 7556.02] + - - [384, 256, 64, 1152] + - [557, 10102.4] + - - [192, 384, 36, 128] + - [592, 7543.14] + - - [128, 128, 36, 7040] + - [565, 7600.7] + - - [128, 256, 64, 1568] + - [557, 10006.0] + - - [128, 128, 36, 1568] + - [549, 7848.4] + - - [128, 256, 64, 72] + - [572, 6553.7] + - - [256, 256, 36, 12544] + - [586, 9365.14] + - - [256, 256, 36, 105] + - [548, 7286.16] + - - [128, 256, 36, 392] + - [535, 7625.79] + - - [64, 64, 64, 5408] + - [578, 8882.77] + - - [3, 64, 36, 25088] + - [545, 529.042] + - - [384, 256, 36, 1024] + - [592, 9182.85] + - - [35, 96, 36, 13440] + - [599, 4110.39] + - - [128, 256, 64, 1152] + - [557, 9804.97] + - - [256, 324, 64, 32] + - [570, 5043.73] + - - [160, 224, 64, 128] + - [584, 6046.25] + - - [192, 224, 36, 2592] + - [582, 8878.78] + - - [96, 96, 64, 1152] + - [560, 8035.55] + - - [32, 64, 36, 90] + - [517, 964.565] + - - [64, 128, 64, 2888] + - [532, 9047.33] + - - [256, 384, 36, 800] + - [592, 9154.12] + - - [512, 512, 64, 4] + - [589, 1233.72] + - - [192, 320, 36, 128] + - [529, 7388.29] + - - [64, 128, 36, 480] + - [593, 5653.37] + - - [192, 384, 64, 242] + - [592, 9080.09] + - - [256, 486, 64, 32] + - [585, 5909.28] + - - [147, 64, 64, 9702] + - [594, 7319.79] + - - [512, 512, 64, 64] + - [528, 8179.12] + - - [64, 192, 64, 3698] + - [521, 9287.99] + - - [73, 192, 64, 10439] + - [552, 6668.12] + - - [1, 16, 36, 1440] + - [568, 33.5452] + - - [128, 256, 36, 512] + - [535, 7989.25] + - - [512, 512, 64, 576] + - [567, 9951.99] + - - [64, 64, 36, 12544] + - [583, 5872.87] + - - [128, 128, 36, 880] + - [580, 7597.36] + - - [192, 224, 36, 128] + - [538, 6451.3] + - - [64, 64, 64, 800] + - [578, 6916.83] + - - [64, 128, 36, 12544] + - [556, 6395.98] + - - [64, 64, 36, 1568] + - [530, 5536.76] + - - [160, 160, 36, 512] + - [530, 7345.36] + - - [512, 24, 64, 512] + - [520, 5242.98] + - - [3, 64, 36, 3136] + - [545, 475.452] + - - [256, 256, 64, 9] + - [570, 2106.61] + - - [3, 64, 64, 11552] + - [573, 785.227] + - - [128, 256, 36, 12544] + - [588, 8792.23] + - - [128, 128, 36, 3136] + - [549, 8098.56] + - - [256, 512, 36, 3136] + - [532, 9694.49] + - - [64, 64, 36, 196] + - [546, 2757.86] + - - [144, 288, 36, 512] + - [580, 7077.99] + - - [256, 24, 64, 32] + - [559, 1483.93] + - - [384, 384, 36, 800] + - [521, 9246.6] + - - [512, 512, 64, 1600] + - [567, 10277.4] + - - [112, 224, 36, 512] + - [535, 6744.88] + - - [128, 128, 36, 49] + - [541, 2716.39] + - - [512, 512, 36, 4] + - [569, 1156.62] + - - [35, 96, 64, 4235] + - [530, 4631.38] + - - [192, 384, 64, 450] + - [521, 9372.3] + - - [256, 256, 36, 1024] + - [592, 9346.74] + - - [112, 224, 64, 1152] + - [535, 7524.05] + - - [256, 512, 64, 400] + - [554, 9598.05] + - - [149, 32, 36, 19072] + - [599, 5811.9] + - - [128, 256, 36, 6272] + - [535, 8754.78] + - - [128, 192, 36, 1568] + - [560, 8195.2] + - - [256, 256, 36, 512] + - [592, 9074.32] + - - [256, 256, 64, 112] + - [592, 8305.65] + - - [512, 512, 64, 18] + - [585, 4324.12] + - - [256, 256, 64, 18] + - [548, 3547.91] + - - [256, 256, 64, 1568] + - [557, 10141.8] + - - [64, 96, 36, 1568] + - [578, 6805.76] + - - [384, 256, 36, 4096] + - [592, 9311.2] + - - [256, 512, 64, 800] + - [567, 9998.45] + - - [256, 384, 36, 2048] + - [592, 9285.44] + - - [3, 64, 36, 200704] + - [574, 547.475] + - - [384, 384, 64, 2304] + - [515, 9901.78] + - - [160, 320, 64, 128] + - [551, 7113.91] + - - [512, 512, 36, 528] + - [521, 9567.75] + - - [160, 320, 36, 128] + - [552, 6411.23] + - - [96, 96, 64, 800] + - [560, 7690.11] + - - [256, 512, 36, 49] + - [548, 6721.35] + - - [384, 384, 64, 450] + - [521, 9523.63] + - - [3, 64, 64, 23104] + - [573, 801.721] + - - [256, 256, 64, 3200] + - [557, 10300.5] + - - [128, 192, 36, 512] + - [535, 7499.85] + - - [192, 192, 64, 288] + - [592, 8774.34] + - - [96, 208, 64, 242] + - [552, 5902.09] + - - [256, 16, 36, 3200] + - [581, 3807.87] + - - [512, 512, 64, 8] + - [559, 2379.85] + - - [64, 128, 64, 5776] + - [532, 9332.84] + - - [512, 512, 64, 288] + - [521, 9522.09] + - - [256, 16, 36, 32] + - [577, 766.105] + - - [128, 192, 64, 288] + - [592, 8527.68] + - - [32, 64, 64, 640] + - [560, 4660.44] + - - [64, 64, 36, 392] + - [560, 3686.5] + - - [384, 384, 36, 1024] + - [526, 9282.58] + - - [64, 64, 36, 11552] + - [590, 5904.88] + - - [96, 128, 36, 6272] + - [580, 8351.09] + - - [128, 256, 36, 16] + - [562, 2144.91] + - - [256, 256, 64, 288] + - [592, 9140.23] + - - [64, 64, 64, 1652] + - [578, 7766.63] + - - [256, 384, 36, 1024] + - [526, 9203.37] + - - [96, 128, 64, 3200] + - [595, 8866.3] + - - [256, 324, 36, 3200] + - [534, 8194.35] + - - [128, 192, 64, 800] + - [592, 9198.13] + - - [64, 128, 64, 10] + - [533, 851.217] + - - [96, 208, 64, 288] + - [560, 6667.68] + - - [64, 96, 36, 2592] + - [542, 7216.98] + - - [64, 128, 64, 160] + - [571, 5191.07] + - - [192, 384, 64, 512] + - [521, 9446.14] + - - [64, 64, 36, 6272] + - [530, 6212.11] + - - [512, 24, 36, 288] + - [527, 3922.57] + - - [128, 128, 64, 1568] + - [521, 9037.96] + - - [112, 224, 64, 242] + - [591, 6399.36] + - - [128, 256, 64, 1600] + - [557, 10010.4] + - - [32, 32, 64, 20000] + - [525, 4378.51] + - - [160, 192, 64, 288] + - [552, 7803.73] + - - [512, 24, 64, 128] + - [513, 3733.9] + - - [512, 512, 36, 32] + - [548, 5935.44] + - - [3, 64, 36, 100352] + - [545, 542.883] + - - [3, 64, 64, 1444] + - [574, 674.259] + - - [512, 512, 36, 3136] + - [515, 9921.2] + - - [128, 256, 64, 6400] + - [575, 10349.4] + - - [256, 256, 36, 2048] + - [592, 9519.09] + - - [128, 160, 64, 288] + - [535, 7549.85] + - - [256, 256, 64, 6400] + - [557, 10392.7] + - - [32, 64, 64, 20000] + - [583, 6493.96] + - - [256, 256, 36, 1680] + - [532, 9513.39] + - - [128, 128, 64, 210] + - [592, 7094.2] + - - [192, 384, 36, 2048] + - [521, 8818.75] + - - [256, 256, 64, 144] + - [592, 8608.71] + - - [384, 384, 36, 4096] + - [526, 9357.04] + - - [160, 320, 64, 1152] + - [552, 8749.58] + - - [384, 256, 36, 2048] + - [592, 9279.73] + - - [256, 512, 36, 392] + - [592, 9252.24] + - - [256, 512, 64, 50] + - [548, 7511.39] + - - [73, 192, 36, 23360] + - [596, 5803.03] + - - [3, 64, 36, 50176] + - [545, 542.137] + - - [384, 384, 36, 2048] + - [521, 9325.9] + - - [256, 384, 64, 450] + - [567, 9528.76] + - - [192, 320, 64, 128] + - [526, 8399.91] + - - [128, 256, 36, 32] + - [541, 3276.9] + - - [160, 192, 36, 512] + - [580, 7752.44] + - - [512, 512, 64, 256] + - [532, 9473.74] + - - [256, 512, 64, 32] + - [570, 6391.42] + - - [384, 384, 64, 576] + - [521, 9614.89] + - - [64, 64, 64, 648] + - [578, 6282.25] + - - [512, 486, 36, 288] + - [592, 8625.03] + - - [32, 64, 36, 1440] + - [530, 3961.6] + - - [144, 288, 64, 242] + - [552, 6347.12] + - - [384, 256, 64, 576] + - [557, 9775.34] + - - [512, 512, 36, 64] + - [528, 7791.38] + - - [448, 384, 64, 128] + - [521, 9132.33] + - - [64, 128, 64, 722] + - [571, 8047.21] + - - [144, 288, 64, 288] + - [580, 6859.5] + - - [512, 512, 64, 224] + - [592, 9427.39] + - - [112, 224, 64, 288] + - [591, 6737.02] + - - [384, 384, 64, 1152] + - [515, 9820.56] + - - [448, 384, 36, 128] + - [592, 8761.41] + - - [64, 64, 64, 100] + - [538, 2708.2] + - - [256, 486, 36, 128] + - [564, 7640.14] + - - [64, 96, 64, 4608] + - [593, 8351.59] + - - [16, 32, 64, 160] + - [517, 736.46] + - - [64, 192, 36, 6272] + - [593, 8041.29] + - - [64, 64, 64, 200] + - [546, 3924.41] + - - [256, 256, 36, 800] + - [592, 9299.65] + - - [64, 128, 36, 6272] + - [590, 6816.46] + - - [32, 64, 64, 40] + - [537, 885.722] + - - [256, 16, 64, 32] + - [587, 1205.36] + - - [192, 384, 36, 800] + - [526, 8673.98] + - - [128, 128, 36, 3200] + - [560, 8538.99] + - - [256, 256, 36, 256] + - [532, 8454.46] + - - [192, 384, 64, 1152] + - [521, 9589.11] + - - [128, 256, 64, 200] + - [531, 8141.22] + - - [64, 96, 64, 1152] + - [560, 7620.98] + - - [128, 128, 36, 392] + - [535, 6175.61] + - - [80, 192, 36, 10368] + - [583, 6497.26] + - - [224, 224, 36, 128] + - [593, 5826.99] + - - [512, 512, 64, 28] + - [548, 5728.91] + - - [256, 16, 64, 1568] + - [563, 4637.3] + - - [144, 288, 64, 1152] + - [580, 7784.34] + - - [256, 256, 64, 576] + - [557, 9596.22] + - - [64, 128, 36, 784] + - [593, 6059.09] + - - [256, 24, 36, 128] + - [527, 2239.94] + - - [256, 256, 64, 2304] + - [557, 10225.8] + - - [192, 384, 36, 512] + - [592, 8549.13] + - - [16, 32, 64, 2560] + - [545, 2153.23] + - - [256, 512, 36, 32] + - [570, 5702.33] + - - [512, 512, 64, 128] + - [592, 9084.21] + - - [128, 128, 64, 200] + - [529, 6972.01] + - - [512, 512, 64, 32] + - [541, 6248.6] + - - [128, 256, 36, 196] + - [541, 6628.86] + - - [8, 384, 64, 6600] + - [573, 2733.99] + - - [149, 32, 64, 8195] + - [535, 6051.01] + - - [35, 96, 64, 6160] + - [580, 4689.45] + - - [64, 64, 36, 1760] + - [530, 5622.34] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml index 08dd4df6f..096950937 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -32091,8 +32091,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32255,8 +32255,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32419,8 +32419,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32583,8 +32583,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32747,8 +32747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32911,8 +32911,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33075,8 +33075,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33239,8 +33239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33399,8 +33399,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33563,8 +33563,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33723,8 +33723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33887,8 +33887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34051,8 +34051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34215,8 +34215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34379,8 +34379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34543,8 +34543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34707,8 +34707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34871,8 +34871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35035,8 +35035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35199,8 +35199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35363,8 +35363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35527,8 +35527,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35691,8 +35691,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35855,8 +35855,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36019,8 +36019,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36186,8 +36186,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36349,8 +36349,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36516,8 +36516,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36679,8 +36679,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36846,8 +36846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37009,8 +37009,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37176,8 +37176,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37339,8 +37339,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37506,8 +37506,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37667,8 +37667,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37828,8 +37828,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37991,8 +37991,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38158,8 +38158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38323,8 +38323,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38486,8 +38486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38653,8 +38653,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38816,8 +38816,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38983,8 +38983,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39146,8 +39146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39309,8 +39309,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39474,8 +39474,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39637,8 +39637,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39800,8 +39800,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39965,8 +39965,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40128,8 +40128,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40291,8 +40291,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40452,8 +40452,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40613,8 +40613,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40774,8 +40774,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40935,8 +40935,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41100,8 +41100,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41263,8 +41263,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41430,8 +41430,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41593,8 +41593,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41756,8 +41756,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41915,8 +41915,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42078,8 +42078,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42239,8 +42239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42404,8 +42404,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42565,8 +42565,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42726,8 +42726,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42887,8 +42887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43052,8 +43052,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43213,8 +43213,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43374,8 +43374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43535,8 +43535,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43696,8 +43696,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43857,8 +43857,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44018,8 +44018,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44179,8 +44179,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44340,8 +44340,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44501,8 +44501,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44662,8 +44662,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44823,8 +44823,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44984,8 +44984,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45145,8 +45145,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45306,8 +45306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45467,8 +45467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45628,8 +45628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45787,8 +45787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45947,8 +45947,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46107,8 +46107,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46267,8 +46267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46427,8 +46427,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46587,8 +46587,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46747,8 +46747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46911,8 +46911,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47071,8 +47071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47231,8 +47231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47391,8 +47391,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47551,8 +47551,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47711,8 +47711,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47871,8 +47871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48035,8 +48035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48195,8 +48195,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48359,8 +48359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48519,8 +48519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48683,8 +48683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48843,8 +48843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49003,8 +49003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49163,8 +49163,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49323,8 +49323,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49483,8 +49483,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49647,8 +49647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49811,8 +49811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49975,8 +49975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50135,8 +50135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50299,8 +50299,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50463,8 +50463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50623,8 +50623,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50787,8 +50787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50951,8 +50951,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51111,8 +51111,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51275,8 +51275,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51439,8 +51439,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51603,8 +51603,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51763,8 +51763,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51927,8 +51927,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52087,8 +52087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52251,8 +52251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52415,8 +52415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52579,8 +52579,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52739,8 +52739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52903,8 +52903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53067,8 +53067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53231,8 +53231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53395,8 +53395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53559,8 +53559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53723,8 +53723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53887,8 +53887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54051,8 +54051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54215,8 +54215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54375,8 +54375,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54539,8 +54539,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54703,8 +54703,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54867,8 +54867,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55031,8 +55031,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55195,8 +55195,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55359,8 +55359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55519,8 +55519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55679,8 +55679,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55839,8 +55839,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55999,8 +55999,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56159,8 +56159,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56319,8 +56319,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56479,8 +56479,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56639,8 +56639,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56799,8 +56799,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56959,8 +56959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57119,8 +57119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57279,8 +57279,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57443,8 +57443,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57607,8 +57607,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57767,8 +57767,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57931,8 +57931,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58095,8 +58095,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58259,8 +58259,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58419,8 +58419,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58583,8 +58583,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58743,8 +58743,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58907,8 +58907,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59071,8 +59071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59231,8 +59231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59395,8 +59395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59559,8 +59559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59723,8 +59723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59887,8 +59887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60051,8 +60051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60215,8 +60215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60379,8 +60379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60543,8 +60543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60707,8 +60707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60871,8 +60871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61035,8 +61035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61199,8 +61199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61363,8 +61363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61527,8 +61527,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61691,8 +61691,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61855,8 +61855,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62015,8 +62015,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62179,8 +62179,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62343,8 +62343,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62507,8 +62507,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62671,8 +62671,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62831,8 +62831,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62991,8 +62991,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63155,8 +63155,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63319,8 +63319,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63483,8 +63483,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63647,8 +63647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63807,8 +63807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63971,8 +63971,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64135,8 +64135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64295,8 +64295,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64459,8 +64459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64619,8 +64619,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64783,8 +64783,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64943,8 +64943,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65103,8 +65103,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65267,8 +65267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65431,8 +65431,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65595,8 +65595,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65759,8 +65759,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65923,8 +65923,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66087,8 +66087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66251,8 +66251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66411,8 +66411,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66575,8 +66575,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66739,8 +66739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66903,8 +66903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67067,8 +67067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67231,8 +67231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67395,8 +67395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67559,8 +67559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67723,8 +67723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67887,8 +67887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68051,8 +68051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68215,8 +68215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68379,8 +68379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68543,8 +68543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68707,8 +68707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68871,8 +68871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69035,8 +69035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69199,8 +69199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69359,8 +69359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69519,8 +69519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69683,8 +69683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69843,8 +69843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70003,8 +70003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70167,8 +70167,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70327,8 +70327,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70491,8 +70491,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70651,8 +70651,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70811,8 +70811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70975,8 +70975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71139,8 +71139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71303,8 +71303,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71467,8 +71467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71631,8 +71631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71795,8 +71795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71959,8 +71959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72123,8 +72123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72287,8 +72287,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72451,8 +72451,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72611,8 +72611,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72775,8 +72775,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72939,8 +72939,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73103,8 +73103,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73267,8 +73267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73431,8 +73431,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73595,8 +73595,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73759,8 +73759,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73923,8 +73923,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74087,8 +74087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74251,8 +74251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74415,8 +74415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74579,8 +74579,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74739,8 +74739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74903,8 +74903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75067,8 +75067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75231,8 +75231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75395,8 +75395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75555,8 +75555,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75715,8 +75715,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75879,8 +75879,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76043,8 +76043,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76203,8 +76203,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76363,8 +76363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76523,8 +76523,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76683,8 +76683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76843,8 +76843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77003,8 +77003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77163,8 +77163,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77327,8 +77327,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77487,8 +77487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77647,8 +77647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77811,8 +77811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77975,8 +77975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78139,8 +78139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78303,8 +78303,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78467,8 +78467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78631,8 +78631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78795,8 +78795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78959,8 +78959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79123,8 +79123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79292,8 +79292,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79457,8 +79457,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79624,8 +79624,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79791,8 +79791,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79958,8 +79958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80125,8 +80125,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80294,8 +80294,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80459,8 +80459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80628,8 +80628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80795,8 +80795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80962,8 +80962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81129,8 +81129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81296,8 +81296,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81463,8 +81463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81630,8 +81630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81795,8 +81795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81962,8 +81962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82129,8 +82129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82296,8 +82296,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82463,8 +82463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82630,8 +82630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82797,8 +82797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82966,8 +82966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83133,8 +83133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83300,8 +83300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83467,8 +83467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83634,8 +83634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83801,8 +83801,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83968,8 +83968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84135,8 +84135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84300,8 +84300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84467,8 +84467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84632,8 +84632,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84799,8 +84799,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84964,8 +84964,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85131,8 +85131,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85300,8 +85300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85467,8 +85467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85634,8 +85634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85801,8 +85801,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85966,8 +85966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86133,8 +86133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86300,8 +86300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86469,8 +86469,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86636,8 +86636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86803,8 +86803,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86968,8 +86968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87135,8 +87135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87304,8 +87304,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87471,8 +87471,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87638,8 +87638,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87805,8 +87805,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87972,8 +87972,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -88139,8 +88139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -88186,23 +88186,23 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -88210,37 +88210,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88248,10 +88245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88259,26 +88256,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88288,6 +88293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88297,6 +88303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88311,47 +88318,55 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 559 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id002 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -88359,37 +88374,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88397,10 +88409,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88408,26 +88420,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88437,6 +88457,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88446,6 +88467,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88460,33 +88482,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 560 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -88501,7 +88531,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88509,47 +88539,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88557,19 +88588,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -88577,6 +88615,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88586,6 +88625,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88595,6 +88635,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88609,14 +88650,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 561 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -88627,63 +88675,69 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2560 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88692,9 +88746,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88702,26 +88756,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88731,6 +88793,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88740,6 +88803,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88754,48 +88818,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 562 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88803,43 +88875,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88847,26 +88924,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88876,6 +88959,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88885,6 +88969,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88899,48 +88984,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 563 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88948,43 +89043,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88992,26 +89092,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89021,6 +89127,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89030,6 +89137,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89044,14 +89152,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 564 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -89062,74 +89177,82 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id007 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89137,26 +89260,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89166,6 +89297,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89175,6 +89307,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89189,96 +89322,105 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 565 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89286,19 +89428,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -89306,6 +89453,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89315,6 +89463,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89324,6 +89473,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89338,48 +89488,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 566 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89387,43 +89547,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89431,20 +89596,19198 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 - PerformanceSyncLocation: -1 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 784 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2080 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4224 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 520 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1040 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 16 + LSPA: 4 + LSPB: 12 + LVCA: 48 + LVCB: 16 + LVPA: 4 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 12, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 12 + LSCB: 16 + LSPA: 16 + LSPB: 12 + LVCA: 12 + LVCB: 16 + LVPA: 16 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 24 + LSCB: 32 + LSPA: 8 + LSPB: 6 + LVCA: 24 + LVCB: 32 + LVPA: 8 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 24 + MacroTile1: 24 + MacroTileA: 24 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 3 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: [3, 4] + ThreadTile0: 3 + ThreadTile1: 4 + ThreadTileA: 3 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89483,25 +108826,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 567 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id008 + ThreadTile: *id018 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id004 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -89515,15 +108858,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89531,27 +108874,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89564,11 +108911,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89576,20 +108923,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89628,25 +108975,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 568 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id022 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id007 + VectorWidth: 2 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -89660,7 +109007,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89668,7 +109015,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89676,37 +109023,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -89715,9 +109062,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89725,13 +109072,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -89777,26 +109124,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 569 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 - SubGroup0: 8 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -89809,16 +109156,165 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89826,26 +109322,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89858,11 +109358,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89870,20 +109370,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89922,17 +109422,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 570 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 + ThreadTile: *id022 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -89940,8 +109440,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -89954,15 +109454,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89971,26 +109471,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90003,11 +109507,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 4 - MacroTileA: 64 - MacroTileB: 4 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90017,18 +109521,18 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90067,25 +109571,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 571 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id008 + ThreadTile: *id018 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id007 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90099,60 +109603,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 8 LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 12 MacroTile0: 64 - MacroTile1: 4 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90160,20 +109668,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90212,25 +109720,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 572 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 - SubGroup0: 16 - SubGroup1: 2 - SubGroupA: 16 - SubGroupB: 2 + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id003 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: *id019 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90244,7 +109752,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90252,39 +109760,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 - LSCB: 16 - LSPA: 4 - LSPB: 12 - LVCA: 48 - LVCB: 16 - LVPA: 4 - LVPB: 12 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90297,11 +109805,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90309,15 +109817,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90361,26 +109869,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 573 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 - SubGroup1: 12 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 12 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 12, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -90393,7 +109901,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90401,7 +109909,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -90409,48 +109917,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 12 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 12 - LVCA: 12 - LVCB: 16 - LVPA: 16 - LVPB: 12 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90458,15 +109966,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90510,25 +110018,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 574 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 - SubGroup0: 12 - SubGroup1: 16 - SubGroupA: 12 - SubGroupB: 16 + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] + VectorWidth: 2 + WorkGroup: *id026 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90542,7 +110050,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90550,7 +110058,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -90558,48 +110066,197 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90607,14 +110264,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -90659,25 +110316,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 575 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SolutionIndex: 699 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: *id026 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90716,39 +110373,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 12 - LVCA: 24 + LSPB: 8 + LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 6 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90756,15 +110413,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90808,25 +110465,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 576 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 - SubGroup1: 6 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 6 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id010 + WorkGroup: *id019 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90848,39 +110505,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 24 + LSCA: 16 LSCB: 32 - LSPA: 8 - LSPB: 6 - LVCA: 24 - LVCB: 32 - LVPA: 8 - LVPB: 6 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90894,10 +110551,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 24 - MacroTile1: 24 - MacroTileA: 24 - MacroTileB: 24 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90905,15 +110562,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 3 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90957,25 +110614,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 577 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 - SubGroup1: 6 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 6 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [3, 4] - ThreadTile0: 3 - ThreadTile1: 4 - ThreadTileA: 3 - ThreadTileB: 4 + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id010 + VectorWidth: 2 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90997,39 +110654,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 + LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 12 - LVCA: 24 - LVCB: 16 - LVPA: 4 - LVPB: 6 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91043,10 +110700,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91054,15 +110711,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91106,85 +110763,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 578 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 - SubGroup1: 6 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 6 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 + ThreadTile: *id020 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id010 + VectorWidth: 4 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91192,10 +110849,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91203,7 +110860,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 1 @@ -91211,7 +110868,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91255,46 +110912,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 579 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -91304,46 +110961,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 LSPA: 8 LSPB: 16 - LVCA: 8 - LVCB: 4 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91354,13 +111011,13 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91404,85 +111061,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 580 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 + ThreadTile: *id022 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 + WorkGroup: *id024 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 8 - LVCB: 2 - LVPA: 2 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91491,9 +111148,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91501,15 +111158,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91553,46 +111210,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 581 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -91602,36 +111259,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 16 - LVCB: 2 - LVPA: 1 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91639,9 +111296,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91650,15 +111307,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91702,48 +111359,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 582 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: *id027 ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: *id026 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91751,36 +111408,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCB: 32 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91799,15 +111456,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91851,85 +111508,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 583 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 + ThreadTile: *id022 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 32 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91937,10 +111594,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91948,15 +111605,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92000,85 +111657,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 584 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 708 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id020 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -92086,10 +111743,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92097,15 +111754,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92149,96 +111806,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 585 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id023 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92247,14 +111904,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92298,46 +111955,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 586 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id020 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id024 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92347,47 +112004,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 LSPA: 8 LSPB: 32 - LVCA: 8 - LVCB: 2 + LVCA: 32 + LVCB: 8 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92395,15 +112052,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92447,46 +112104,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 587 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: *id024 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -92496,36 +112153,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -92534,9 +112191,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92544,15 +112201,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92596,46 +112253,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 588 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id012 + ThreadTile: *id018 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id026 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -92645,36 +112302,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 + LSCB: 32 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -92683,9 +112340,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92693,15 +112350,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92745,75 +112402,75 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 589 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id022 ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id026 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 2 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -92823,18 +112480,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92843,14 +112500,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92894,48 +112551,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 590 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 4, 1] + VectorWidth: 4 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -92943,47 +112600,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92991,15 +112648,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93043,96 +112700,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 591 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id012 + ThreadTile: *id018 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: *id026 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93140,15 +112797,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93192,46 +112849,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 592 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id020 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93241,46 +112898,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 2 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -93289,15 +112946,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93341,46 +112998,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 593 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 + ThreadTile: *id021 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -93390,36 +113047,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCB: 32 + LSPA: 8 LSPB: 16 LVCA: 32 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93438,15 +113095,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93490,75 +113147,75 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 594 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id022 ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: *id024 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -93568,18 +113225,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93587,15 +113244,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93639,48 +113296,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 595 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: *id026 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93688,21 +113345,21 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3328 @@ -93717,18 +113374,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93736,15 +113393,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93788,25 +113445,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 596 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 + ThreadTile: *id018 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: *id026 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -93820,7 +113477,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93828,39 +113485,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 4 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 384 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -93873,11 +113526,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93885,20 +113538,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 6 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 6 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -93937,26 +113590,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 597 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -93969,7 +113622,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93977,39 +113630,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94022,11 +113671,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94034,20 +113683,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94086,96 +113735,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 598 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94183,15 +113832,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -94235,48 +113884,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 599 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94284,47 +113933,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94332,14 +113981,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -94384,48 +114033,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 600 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id031 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94433,47 +114082,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94481,20 +114126,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94533,96 +114178,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 601 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id019 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 + KernelLanguage: Source + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94630,20 +114271,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94682,35 +114323,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 602 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -94720,10 +114361,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94731,26 +114372,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -94760,18 +114401,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94779,8 +114420,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -94831,35 +114472,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 603 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 + ThreadTile: *id030 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -94869,10 +114510,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94880,47 +114521,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94928,8 +114565,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -94941,7 +114578,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94980,35 +114617,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 604 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -95018,58 +114655,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 + KernelLanguage: Source + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95077,15 +114714,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -95129,35 +114766,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 605 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -95167,58 +114804,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 + KernelLanguage: Source + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95226,15 +114863,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -95278,35 +114915,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 606 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id031 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -95316,58 +114953,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95375,20 +115008,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95427,35 +115060,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 607 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -95465,8 +115098,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -95476,47 +115109,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 + KernelLanguage: Source + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95524,15 +115157,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -95576,96 +115209,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 608 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 + ThreadTile: *id030 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95673,20 +115302,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95725,96 +115354,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 609 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95822,20 +115447,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95874,96 +115499,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 610 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id031 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + KernelLanguage: Source + LSCA: 128 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95971,20 +115592,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -96023,48 +115644,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 611 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96072,47 +115693,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96120,20 +115737,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -96172,48 +115789,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 612 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 + WorkGroup: *id028 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96221,47 +115838,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96269,14 +115886,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -96321,25 +115938,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 613 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: *id028 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -96353,7 +115970,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96362,7 +115979,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96370,47 +115987,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 128 LVCA: 32 - LVCB: 16 + LVCB: 2 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96418,13 +116035,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -96470,26 +116087,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 614 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -96502,7 +116119,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96510,45 +116127,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96556,10 +116173,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96567,15 +116184,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96619,25 +116236,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 615 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -96657,7 +116274,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -96668,7 +116285,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -96676,39 +116293,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPB: 64 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96716,20 +116329,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -96768,14 +116381,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 616 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -96786,8 +116399,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -96817,7 +116430,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -96827,16 +116440,16 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96846,18 +116459,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96867,13 +116480,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96917,17 +116530,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 617 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -96935,8 +116548,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -96955,58 +116568,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97014,20 +116623,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -97066,26 +116675,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 618 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97106,56 +116715,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97163,14 +116772,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -97215,26 +116824,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 619 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97256,7 +116865,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97264,47 +116873,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97312,14 +116921,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -97364,26 +116973,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 620 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97402,58 +117011,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97461,8 +117066,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -97474,7 +117079,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -97513,26 +117118,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 621 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97562,7 +117167,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -97579,7 +117184,7 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -97591,14 +117196,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -97662,26 +117267,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 622 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97702,35 +117307,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 7680 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -97740,18 +117345,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97759,14 +117364,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -97811,25 +117416,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 623 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id025 + VectorWidth: 2 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -97849,58 +117454,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97908,20 +117509,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -97960,25 +117561,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 624 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -98009,7 +117610,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -98017,39 +117618,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98057,15 +117658,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98109,25 +117710,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 625 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -98147,7 +117748,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -98158,7 +117759,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -98168,37 +117769,33 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98208,18 +117805,18 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -98258,17 +117855,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 626 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + ThreadTile: *id035 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -98276,8 +117873,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -98298,56 +117895,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98355,8 +117952,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -98407,26 +118004,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 627 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -98445,58 +118042,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98504,20 +118097,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -98556,26 +118149,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 628 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -98597,7 +118190,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -98605,47 +118198,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98653,14 +118246,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -98705,25 +118298,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 629 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -98745,19 +118338,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -98766,35 +118359,35 @@ LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98802,15 +118395,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98854,25 +118447,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 630 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id019 + VectorWidth: 2 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -98892,58 +118485,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98951,20 +118540,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -99003,25 +118592,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 631 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -99043,56 +118632,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99100,13 +118689,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -99152,26 +118741,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 632 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -99184,7 +118773,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99192,56 +118781,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99249,15 +118838,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99301,26 +118890,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 633 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -99333,13 +118922,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -99350,7 +118939,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -99358,39 +118947,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 16 LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 2 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99398,20 +118983,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -99450,14 +119035,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 634 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id027 + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -99468,7 +119053,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -99482,7 +119067,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99490,56 +119075,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99547,14 +119132,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -99599,26 +119184,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 635 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -99631,64 +119216,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 128 LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99696,20 +119277,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -99748,25 +119329,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 636 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -99780,7 +119361,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99797,7 +119378,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -99806,38 +119387,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 128 LVCA: 32 - LVCB: 8 + LVCB: 2 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 24 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99845,13 +119426,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -99897,14 +119478,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 637 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -99915,7 +119496,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -99929,64 +119510,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99994,20 +119571,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100046,26 +119623,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 638 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -100078,7 +119655,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100086,56 +119663,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100143,13 +119720,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -100195,26 +119772,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 639 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -100233,7 +119810,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -100244,7 +119821,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -100252,39 +119829,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100292,20 +119865,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100344,14 +119917,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 640 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -100362,7 +119935,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -100393,7 +119966,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -100412,7 +119985,7 @@ LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -100422,18 +119995,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100441,14 +120014,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -100493,17 +120066,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 641 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -100511,7 +120084,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -100531,58 +120104,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100590,20 +120159,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100642,25 +120211,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 642 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -100682,56 +120251,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100739,14 +120308,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -100791,26 +120360,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 643 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -100829,7 +120398,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -100840,7 +120409,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -100857,30 +120426,26 @@ LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100888,20 +120453,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100940,14 +120505,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 644 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id027 + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -100958,8 +120523,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -100980,56 +120545,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101037,14 +120602,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -101089,26 +120654,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 645 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -101127,7 +120692,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -101138,7 +120703,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101146,39 +120711,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101186,20 +120747,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -101238,14 +120799,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 646 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -101256,7 +120817,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -101287,7 +120848,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101306,7 +120867,7 @@ LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -101316,18 +120877,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101335,14 +120896,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -101387,17 +120948,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 647 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -101405,7 +120966,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -101425,7 +120986,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -101436,7 +120997,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101444,39 +121005,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101484,20 +121041,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -101536,17 +121093,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 648 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 + ThreadTile: *id037 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -101554,7 +121111,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -101585,7 +121142,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101602,7 +121159,7 @@ LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 14336 LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 @@ -101614,14 +121171,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -101685,25 +121242,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 649 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -101723,58 +121280,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101782,20 +121335,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -101834,26 +121387,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 650 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -101874,56 +121427,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 4 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101931,14 +121484,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -101983,96 +121536,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 651 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Source + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102081,14 +121634,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102132,47 +121685,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 652 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 + VectorWidth: 2 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 2 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -102180,48 +121733,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCB: 2 + LSPA: 2 + LSPB: 32 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 2 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102229,15 +121782,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102281,26 +121834,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 653 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id040 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102313,7 +121866,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102321,39 +121874,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102366,11 +121919,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102378,7 +121931,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -102430,26 +121983,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 654 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppresssNoLoadLoop: true + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id044 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102462,7 +122015,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102470,56 +122023,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102527,15 +122080,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102579,26 +122132,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 655 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id042 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102611,7 +122164,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102619,8 +122172,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -102631,44 +122184,44 @@ GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 8 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102676,15 +122229,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102728,26 +122281,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 656 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 + ThreadTile: *id041 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102760,7 +122313,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102768,7 +122321,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -102780,44 +122333,44 @@ GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102825,15 +122378,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102877,25 +122430,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 657 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: *id042 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -102909,7 +122462,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102917,7 +122470,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -102930,26 +122483,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102962,10 +122515,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 2 + MacroTile0: 8 MacroTile1: 8 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -102976,13 +122529,13 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103026,35 +122579,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 658 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 SubGroup1: 4 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id018 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103064,54 +122617,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + KernelLanguage: Assembly + LSCA: 16 LSCB: 8 LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103119,20 +122676,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103171,92 +122728,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 659 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103264,20 +122825,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103316,35 +122877,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 660 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id043 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103354,58 +122915,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 16 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103413,8 +122974,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -103465,48 +123026,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 661 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id044 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103519,27 +123080,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3360 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103550,11 +123111,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103562,14 +123123,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -103614,77 +123175,81 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 662 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id046 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3360 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103695,11 +123260,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103707,20 +123272,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103759,35 +123324,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 663 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id046 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103797,8 +123362,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103815,21 +123380,25 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103841,9 +123410,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -103852,20 +123421,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103904,35 +123473,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 664 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103942,8 +123511,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103960,7 +123529,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 @@ -103970,15 +123539,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104053,35 +123622,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 665 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id046 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104092,38 +123661,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 32 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104135,10 +123704,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104147,20 +123716,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104198,35 +123767,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 666 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 790 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppresssNoLoadLoop: false - ThreadTile: *id029 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 1 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104237,42 +123806,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 2 + LSPB: 16 + LVCA: 128 + LVCB: 16 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104284,10 +123849,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104296,20 +123861,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104347,96 +123912,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 667 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 791 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id029 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id048 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 16 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 256 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 1 + LSPB: 16 + LVCA: 256 + LVCB: 16 + LVPA: 1 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 4096 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104445,20 +124006,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104496,35 +124057,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 668 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 792 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: *id050 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id048 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104535,38 +124096,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104578,10 +124139,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104589,21 +124150,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104641,35 +124202,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 669 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SolutionIndex: 793 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ThreadTile: *id047 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104680,42 +124241,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104727,9 +124284,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104738,21 +124295,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104790,46 +124347,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 670 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 794 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 + ThreadTile: *id047 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id049 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104846,21 +124403,21 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104871,7 +124428,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -104885,19 +124442,19 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104935,46 +124492,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 671 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 795 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id029 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104991,21 +124548,21 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3136 LdsOffsetA: 0 LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105016,11 +124573,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105031,18 +124588,18 @@ NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -105080,75 +124637,77 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 672 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 796 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: *id050 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 32 + LVCA: 128 LVCB: 8 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105161,7 +124720,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -105174,9 +124733,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 @@ -105186,7 +124743,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105225,73 +124782,72 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 673 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 797 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 1 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 1 LSPB: 32 - LVCA: 16 + LVCA: 256 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 1 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2304 LdsOffsetA: 0 LdsOffsetB: 2048 LdsPadA: 0 @@ -105302,15 +124858,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105318,14 +124874,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -105370,79 +124924,78 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 674 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + SolutionIndex: 798 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id051 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id053 WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105455,10 +125008,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -105467,14 +125020,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -105519,31 +125070,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 675 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 799 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + ThreadTile: *id051 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id052 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -105557,37 +125108,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -105606,9 +125156,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105616,13 +125166,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -105668,31 +125216,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 676 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SolutionIndex: 800 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 2 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -105706,8 +125254,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -105725,22 +125272,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 - LVCA: 32 + LSPB: 64 + LVCA: 16 LVCB: 2 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105754,10 +125301,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105765,15 +125312,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105817,17 +125362,16 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 677 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 801 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -105835,13 +125379,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: *id055 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -105849,14 +125394,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -105875,17 +125419,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105898,7 +125446,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -105910,20 +125458,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105962,31 +125508,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 678 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 802 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: *id055 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106000,8 +125546,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106028,13 +125573,9 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106049,9 +125590,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106059,20 +125600,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106111,17 +125650,16 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 679 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 + SolutionIndex: 803 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id051 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -106129,13 +125667,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106149,8 +125688,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106177,9 +125715,13 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106194,9 +125736,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106204,20 +125746,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106256,31 +125796,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 680 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 804 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106294,8 +125834,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106322,13 +125861,9 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106355,8 +125890,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -106366,7 +125899,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106405,14 +125938,13 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 681 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 805 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 + ThreadTile: *id054 ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -106423,162 +125955,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 - WorkGroupMappingType: B - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: false - UseBeta: true - UseInitialStrides: false - SolutionIndex: 682 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106592,8 +125976,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106649,8 +126032,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -106699,14 +126080,13 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 683 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 806 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -106717,13 +126097,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106737,8 +126118,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106798,8 +126178,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -106848,14 +126226,13 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 684 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 807 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -106866,13 +126243,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106886,37 +126264,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -106935,9 +126312,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106945,14 +126322,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -106997,31 +126372,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 685 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SolutionIndex: 808 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -107035,8 +126410,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107063,9 +126437,13 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107080,9 +126458,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107090,20 +126468,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107142,17 +126518,16 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 686 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 + SolutionIndex: 809 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -107160,30 +126535,32 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: *id053 + WorkGroupMapping: 8 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -107196,25 +126573,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107227,11 +126604,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107239,15 +126616,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107291,33 +126671,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 687 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 810 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107329,8 +126719,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107345,21 +126735,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107373,10 +126767,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107384,20 +126778,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107436,48 +126833,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 688 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 811 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -107490,21 +126897,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -107521,10 +126928,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -107533,15 +126940,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107585,17 +126995,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 689 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 812 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -107603,15 +127020,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107623,8 +127043,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107639,21 +127059,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107667,9 +127091,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -107678,20 +127102,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107730,17 +127157,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 690 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 813 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -107748,15 +127182,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107769,7 +127206,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107784,25 +127221,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107816,9 +127253,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -107827,15 +127264,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107879,17 +127319,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 691 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 814 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -107897,28 +127344,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -107933,31 +127383,31 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -107965,10 +127415,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107976,15 +127426,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108028,48 +127481,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 692 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 815 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108082,23 +127545,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -108109,7 +127576,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -108123,18 +127590,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108173,14 +127643,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 693 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 816 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -108191,15 +127668,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108212,7 +127692,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108227,42 +127707,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 16 + LVCA: 8 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108270,15 +127750,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108322,17 +127805,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 694 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 817 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -108340,15 +127830,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108361,42 +127854,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -108409,9 +127902,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108419,15 +127912,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108471,33 +127967,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 695 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 818 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108509,10 +128015,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108525,38 +128031,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108564,20 +128074,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108616,33 +128129,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 696 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 819 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108655,7 +128178,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108670,42 +128193,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108713,15 +128236,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108765,17 +128291,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 697 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 820 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -108783,28 +128316,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108819,38 +128355,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108858,20 +128398,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108910,46 +128453,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 698 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108964,42 +128517,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109007,15 +128560,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109059,46 +128615,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 699 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 822 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109113,23 +128679,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109140,11 +128710,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109152,20 +128722,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109204,48 +128777,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 700 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 823 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109258,42 +128841,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109301,15 +128884,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109353,46 +128939,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 701 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 824 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109407,34 +129003,38 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -109448,18 +129048,21 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109498,48 +129101,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 702 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 825 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109552,42 +129165,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109597,13 +129210,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109647,17 +129263,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 703 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 826 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -109665,15 +129288,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -109685,8 +129311,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109701,38 +129327,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109740,20 +129370,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109792,17 +129425,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 704 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 827 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -109810,28 +129450,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109846,27 +129489,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109877,7 +129520,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -109891,13 +129534,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109941,14 +129587,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 705 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 828 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -109959,30 +129612,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109995,38 +129651,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110034,20 +129694,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110086,46 +129749,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 706 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 829 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -110140,42 +129813,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110183,15 +129856,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110235,17 +129911,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 707 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 830 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -110253,30 +129936,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110289,38 +129975,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110330,18 +130020,21 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110380,17 +130073,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 708 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 831 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -110398,15 +130098,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -110419,7 +130122,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -110434,7 +130137,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 @@ -110446,30 +130149,30 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110477,15 +130180,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110529,14 +130235,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 709 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 832 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -110547,28 +130260,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -110583,23 +130299,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110610,7 +130330,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -110624,18 +130344,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110674,14 +130397,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 710 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 833 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -110692,30 +130422,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110728,42 +130461,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110771,15 +130504,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110823,17 +130559,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 711 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 834 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -110841,15 +130584,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -110861,8 +130607,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -110877,38 +130623,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110916,20 +130666,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110968,48 +130721,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 712 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 835 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -111022,25 +130785,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111053,11 +130816,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111065,15 +130828,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111094,6 +130860,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111103,6 +130870,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111117,79 +130885,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 713 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 836 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 - LVCA: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111202,11 +130980,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111214,15 +130992,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111243,6 +131024,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111252,6 +131034,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111266,48 +131049,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 714 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 837 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id040 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -111319,26 +131112,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 2 - LSPA: 2 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111351,10 +131144,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 256 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 256 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111363,15 +131156,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111392,6 +131188,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111401,6 +131198,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111415,96 +131213,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 715 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 838 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111512,15 +131320,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111541,6 +131352,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111550,6 +131362,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111564,79 +131377,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 716 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 839 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 - LVCA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111649,11 +131472,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111661,15 +131484,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111690,6 +131516,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111699,6 +131526,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111713,33 +131541,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 717 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 840 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -111752,57 +131590,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111810,15 +131648,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111839,6 +131680,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111848,6 +131690,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111862,79 +131705,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 718 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 841 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 15 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 - LVCA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111947,11 +131800,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111959,15 +131812,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111988,6 +131844,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111997,6 +131854,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112011,33 +131869,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 719 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 842 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112050,57 +131918,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112108,15 +131976,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112137,6 +132008,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112146,6 +132018,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112160,33 +132033,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 720 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 843 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112198,57 +132081,53 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 256 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 256 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 256 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112257,20 +132136,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -112286,6 +132168,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112295,6 +132178,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112309,33 +132193,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 721 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 844 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112348,57 +132242,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112406,15 +132300,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112435,6 +132332,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112444,6 +132342,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112458,95 +132357,101 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 722 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 845 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 256 + LSCB: 8 + LSPA: 4 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 256 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 256 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112555,20 +132460,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -112584,6 +132492,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112593,6 +132502,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112607,48 +132517,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 723 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 846 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -112664,24 +132584,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112692,11 +132612,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112706,13 +132626,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112733,6 +132656,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112742,6 +132666,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112756,14 +132681,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 724 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 847 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -112774,30 +132706,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -112813,28 +132748,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112842,10 +132777,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112855,13 +132790,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112882,6 +132820,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112891,6 +132830,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112905,33 +132845,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 725 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 848 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112944,7 +132894,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -112959,7 +132909,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 @@ -112971,15 +132921,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112992,9 +132942,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113002,15 +132952,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -113031,6 +132984,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113040,6 +132994,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113054,18 +133009,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 726 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 849 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id045 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -113073,14 +133035,17 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113093,7 +133058,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -113108,41 +133073,41 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -113151,15 +133116,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -113180,6 +133148,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113189,6 +133158,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113203,13 +133173,20 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 727 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 850 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: true + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -113221,15 +133198,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113241,54 +133221,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113297,20 +133281,23 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113325,6 +133312,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113334,6 +133322,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113348,33 +133337,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 728 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppresssNoLoadLoop: false - ThreadTile: *id047 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 851 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 11 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113386,39 +133385,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 2 - LSPB: 16 - LVCA: 128 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113430,10 +133433,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113442,20 +133445,23 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113470,6 +133476,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113479,6 +133486,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113493,92 +133501,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 729 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 852 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id048 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 16 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 64 LSCB: 16 - LSPA: 1 - LSPB: 16 - LVCA: 256 - LVCB: 16 - LVPA: 1 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4640 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113586,21 +133608,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113615,6 +133640,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113624,6 +133650,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113638,33 +133665,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 730 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 853 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id050 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id048 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113676,50 +133713,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -113731,21 +133772,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113760,6 +133804,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113769,6 +133814,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113783,81 +133829,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 731 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 854 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -113878,19 +133938,22 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113905,6 +133968,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113914,6 +133978,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113928,48 +133993,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 732 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 855 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -113982,23 +134057,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114009,10 +134088,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114021,21 +134100,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -114050,6 +134132,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114059,6 +134142,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114073,17 +134157,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 733 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 856 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -114091,15 +134182,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114111,8 +134205,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -114127,23 +134221,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114155,9 +134253,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114166,21 +134264,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -114195,6 +134296,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114204,6 +134306,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114218,17 +134321,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 734 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 857 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id050 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -114236,59 +134346,64 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114301,7 +134416,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -114314,12 +134429,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114340,6 +134460,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114349,6 +134470,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114363,74 +134485,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 735 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 858 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id052 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 16 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 32 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114439,15 +134576,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114459,14 +134596,19 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114482,6 +134624,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114491,6 +134634,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114505,74 +134649,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 736 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id051 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 859 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 32 + SubGroupA: 16 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id053 - WorkGroupMapping: 8 + WorkGroup: [16, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 64 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -114589,10 +134744,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114601,13 +134756,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114628,6 +134788,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114637,6 +134798,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114651,33 +134813,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 737 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 860 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id052 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114689,8 +134861,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -114698,20 +134871,20 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -114748,12 +134921,19 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114765,6 +134945,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -114774,6 +134955,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114783,6 +134965,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114797,33 +134980,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 738 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 861 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id053 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114835,9 +135026,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -114850,25 +135042,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 2 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114882,9 +135074,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114899,7 +135091,12 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114911,6 +135108,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -114920,6 +135118,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114929,6 +135128,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114943,33 +135143,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 739 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 862 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id055 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114981,9 +135191,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -114996,25 +135207,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 2 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115028,9 +135239,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115041,11 +135252,18 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115057,6 +135275,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115066,6 +135285,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115075,6 +135295,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115089,47 +135310,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 740 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 863 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id055 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -115142,21 +135372,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115169,11 +135403,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115186,19 +135420,25 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115208,6 +135448,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115217,6 +135458,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115231,33 +135473,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 741 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id051 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 864 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115269,7 +135521,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115284,25 +135537,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115316,10 +135569,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115327,13 +135580,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115345,6 +135605,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115354,6 +135615,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115363,6 +135625,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115377,33 +135640,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 742 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 865 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115415,6 +135686,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -115430,21 +135702,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115458,9 +135734,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115469,24 +135745,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115496,6 +135778,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115505,6 +135788,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115519,33 +135803,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 743 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 866 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id054 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115557,7 +135851,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115572,7 +135867,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -115584,9 +135879,13 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115601,9 +135900,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115611,24 +135910,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115638,6 +135945,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115647,6 +135955,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115661,33 +135970,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 744 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 867 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115699,6 +136016,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -115714,7 +136032,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -115726,9 +136044,9 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -115747,9 +136065,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115757,13 +136075,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115775,6 +136098,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115784,6 +136108,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115793,6 +136118,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115807,80 +136133,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 745 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 868 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115891,10 +136228,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115903,13 +136240,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115921,6 +136265,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115930,6 +136275,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115939,6 +136285,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115953,33 +136300,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 746 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 869 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id053 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115991,42 +136346,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116038,9 +136390,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -116049,24 +136401,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116076,6 +136436,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116085,6 +136446,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116099,28 +136461,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 747 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 870 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id053 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116132,49 +136501,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116185,10 +136550,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -116197,12 +136562,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116213,13 +136580,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116229,6 +136597,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116238,6 +136607,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116256,8 +136626,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 871 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116265,24 +136635,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116294,7 +136662,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116302,41 +136670,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 32 + LVCA: 48 LVCB: 4 - LVPA: 2 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116347,11 +136715,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116359,12 +136727,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116382,6 +136750,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116391,6 +136760,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116400,6 +136770,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116418,8 +136789,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + SolutionIndex: 872 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116428,22 +136799,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -116456,15 +136827,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -116472,33 +136843,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116509,10 +136880,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -116521,12 +136892,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116538,12 +136911,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116553,6 +136927,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116562,6 +136937,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116580,8 +136956,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 873 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116590,23 +136966,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116625,42 +136999,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116672,10 +137046,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116683,12 +137057,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116700,12 +137076,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116715,6 +137092,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116724,6 +137102,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116742,8 +137121,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 874 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116752,23 +137131,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116788,41 +137165,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116834,10 +137211,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116845,12 +137222,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116862,12 +137239,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116877,6 +137255,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116886,6 +137265,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116904,8 +137284,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + SolutionIndex: 875 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116914,19 +137294,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -116942,15 +137322,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -116958,48 +137338,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117007,12 +137387,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -117030,6 +137412,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117039,6 +137422,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117048,6 +137432,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117066,33 +137451,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + SolutionIndex: 876 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117192,6 +137575,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117201,6 +137585,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117210,6 +137595,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117228,8 +137614,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 877 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117244,13 +137630,13 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -117266,16 +137652,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117291,28 +137677,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117320,10 +137706,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117331,13 +137717,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117354,6 +137742,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117363,6 +137752,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117372,6 +137762,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117390,33 +137781,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 878 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117428,7 +137817,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117437,7 +137826,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117454,21 +137843,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -117481,7 +137870,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -117497,7 +137886,7 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -117516,6 +137905,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117525,6 +137915,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117534,6 +137925,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117552,8 +137944,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 879 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117568,15 +137960,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -117596,10 +137988,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117618,36 +138010,32 @@ LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117655,8 +138043,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -117671,13 +138061,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117687,6 +138078,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117696,6 +138088,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117714,33 +138107,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 880 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117759,7 +138150,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117777,38 +138168,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -117817,11 +138208,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -117840,6 +138233,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117849,6 +138243,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117858,6 +138253,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117876,33 +138272,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + SolutionIndex: 881 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117914,7 +138308,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117939,28 +138333,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 640 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117968,10 +138362,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117979,13 +138373,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118002,6 +138396,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118011,6 +138406,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118020,6 +138416,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118038,31 +138435,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 882 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -118076,14 +138473,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -118102,27 +138499,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118131,9 +138524,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118141,11 +138534,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -118157,13 +138552,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118173,6 +138569,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118182,6 +138579,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118200,33 +138598,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 883 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118245,7 +138641,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -118305,6 +138701,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -118326,6 +138724,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118335,6 +138734,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118344,6 +138744,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118362,8 +138763,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 884 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118378,7 +138779,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -118387,8 +138788,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118409,7 +138808,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118425,39 +138824,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118465,11 +138864,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -118488,6 +138887,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118497,6 +138897,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118506,6 +138907,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118524,28 +138926,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 885 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -118562,53 +138964,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118617,9 +139015,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118628,12 +139026,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118643,13 +139043,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118659,6 +139060,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118668,6 +139070,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118686,8 +139089,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + SolutionIndex: 886 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118695,24 +139098,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118724,53 +139125,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118778,10 +139175,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118789,13 +139186,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118805,13 +139204,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118821,6 +139221,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118830,6 +139231,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118848,8 +139250,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 887 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118857,24 +139259,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118886,64 +139286,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118952,12 +139348,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118967,13 +139365,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118983,6 +139382,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118992,6 +139392,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119010,8 +139411,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 888 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119019,24 +139420,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119054,43 +139453,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -119102,10 +139497,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119113,13 +139508,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119129,13 +139526,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119145,6 +139543,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119154,6 +139553,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119172,33 +139572,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 889 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119216,58 +139614,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119276,12 +139670,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119291,13 +139687,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119307,6 +139704,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119316,6 +139714,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119334,8 +139733,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 890 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119343,24 +139742,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119372,16 +139769,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119398,27 +139795,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -119427,9 +139824,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119439,11 +139836,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119460,6 +139859,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119469,6 +139869,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119478,6 +139879,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119496,8 +139898,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 891 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119506,23 +139908,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119534,7 +139934,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -119559,28 +139959,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 + LVCA: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -119588,9 +139988,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -119603,9 +140003,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119622,6 +140022,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119631,6 +140032,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119640,6 +140042,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119658,8 +140061,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 892 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119668,10 +140071,10 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -119679,10 +140082,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -119696,14 +140099,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119722,38 +140125,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 2 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119761,13 +140164,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119784,6 +140189,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119793,6 +140199,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119802,6 +140209,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119820,8 +140228,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 893 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119830,23 +140238,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119858,7 +140264,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -119867,7 +140273,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119883,24 +140289,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -119911,11 +140317,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119923,13 +140329,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119946,6 +140352,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119955,6 +140362,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119964,6 +140372,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119982,20 +140391,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + SolutionIndex: 894 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -120003,10 +140412,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120026,10 +140435,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120047,33 +140456,29 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -120085,13 +140490,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120101,13 +140508,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120117,6 +140525,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120126,6 +140535,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120144,8 +140554,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 895 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120153,7 +140563,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -120165,12 +140575,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120182,13 +140590,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -120208,27 +140616,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 16 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120247,13 +140651,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120263,13 +140667,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120279,6 +140684,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120288,6 +140694,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120306,8 +140713,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 896 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120315,7 +140722,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -120327,10 +140734,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120350,43 +140757,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -120398,9 +140801,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -120409,12 +140812,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -120425,13 +140830,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120451,8 +140857,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120470,8 +140876,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + SolutionIndex: 897 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120479,24 +140885,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120514,43 +140918,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -120562,9 +140962,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -120573,12 +140973,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -120589,13 +140991,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120615,8 +141018,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120634,33 +141037,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + SolutionIndex: 898 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120679,42 +141080,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 64 LSCB: 8 LSPA: 4 LSPB: 32 - LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -120726,10 +141127,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120737,12 +141138,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -120760,6 +141163,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120779,8 +141183,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120798,33 +141202,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + SolutionIndex: 899 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120842,43 +141244,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -120890,9 +141288,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -120902,12 +141300,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120917,13 +141317,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120943,8 +141344,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120962,33 +141363,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + SolutionIndex: 900 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121006,43 +141405,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121054,10 +141449,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121065,13 +141460,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121081,13 +141478,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121107,8 +141505,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121126,33 +141524,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + SolutionIndex: 901 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 7 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121170,43 +141566,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121218,10 +141610,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121229,13 +141621,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121245,13 +141639,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121271,8 +141666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121290,33 +141685,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + SolutionIndex: 902 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 15 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121335,42 +141728,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121382,9 +141775,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -121393,12 +141786,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -121416,6 +141811,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121435,8 +141831,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121454,8 +141850,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + SolutionIndex: 903 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121464,23 +141860,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121498,43 +141892,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121546,10 +141936,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121557,13 +141947,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121573,13 +141965,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121599,8 +141992,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121618,33 +142011,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + SolutionIndex: 904 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 17 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121663,38 +142054,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 64 LSCB: 8 - LSPA: 4 + LSPA: 2 LSPB: 16 LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 8 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121706,9 +142097,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -121717,13 +142108,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121740,6 +142133,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121759,8 +142153,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121778,33 +142172,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + SolutionIndex: 905 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 17 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121816,49 +142208,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121869,10 +142257,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -121881,12 +142269,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -121897,13 +142287,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121923,8 +142314,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121942,33 +142333,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + SolutionIndex: 906 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121980,45 +142369,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 + LSCA: 64 + LSCB: 16 LSPA: 4 LSPB: 16 LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -122029,11 +142418,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122041,12 +142430,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -122058,12 +142449,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122083,8 +142475,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122102,33 +142494,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + SolutionIndex: 907 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122146,43 +142536,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -122195,9 +142581,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122205,12 +142591,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -122221,13 +142609,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122247,8 +142636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122266,8 +142655,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + SolutionIndex: 908 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122275,24 +142664,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122310,58 +142697,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122370,12 +142753,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -122385,13 +142770,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122411,8 +142797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122430,8 +142816,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + SolutionIndex: 909 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122439,24 +142825,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122474,43 +142858,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -122523,9 +142903,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122533,13 +142913,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -122549,13 +142931,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122575,8 +142958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122594,33 +142977,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + SolutionIndex: 910 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 7 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122638,58 +143019,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122697,13 +143074,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -122713,13 +143092,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122739,8 +143119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122758,8 +143138,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + SolutionIndex: 911 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122767,24 +143147,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 7 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122802,54 +143180,50 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -122862,11 +143236,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -122877,13 +143253,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122903,8 +143280,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122922,33 +143299,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + SolutionIndex: 912 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 11 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122966,43 +143341,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123015,9 +143386,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123025,12 +143396,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -123041,13 +143414,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123067,8 +143441,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123086,8 +143460,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + SolutionIndex: 913 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -123095,24 +143469,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123130,58 +143502,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123189,13 +143557,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -123205,13 +143575,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123231,8 +143602,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123250,8 +143621,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 914 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -123259,24 +143630,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123288,60 +143657,56 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -123354,11 +143719,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -123369,13 +143736,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123395,8 +143763,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123414,33 +143782,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + SolutionIndex: 915 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123452,64 +143818,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123519,11 +143881,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -123533,13 +143897,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123559,8 +143924,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123578,8 +143943,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 916 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -123587,24 +143952,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123622,43 +143985,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123670,10 +144029,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123681,13 +144040,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -123697,13 +144058,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123723,8 +144085,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123742,33 +144104,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + SolutionIndex: 917 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123786,43 +144146,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123835,9 +144191,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123847,11 +144203,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -123861,13 +144219,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123887,8 +144246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123906,33 +144265,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123944,16 +144301,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123969,24 +144326,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123997,11 +144350,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124009,13 +144362,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -124025,13 +144380,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124051,8 +144407,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124070,20 +144426,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -124091,12 +144447,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124108,16 +144462,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124133,24 +144487,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 16 - LSPB: 128 - LVCA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 LVCB: 4 - LVPA: 4 - LVPB: 32 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124161,11 +144511,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124173,13 +144523,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -124189,13 +144539,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124215,8 +144566,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124234,20 +144585,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -124255,10 +144606,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124272,47 +144623,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124325,11 +144672,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124337,8 +144684,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124353,7 +144700,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -124379,8 +144726,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124398,8 +144745,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + SolutionIndex: 921 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124407,22 +144754,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124442,9 +144789,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124452,48 +144799,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 16 + LVCA: 8 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124501,10 +144844,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124519,14 +144860,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124546,8 +144886,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124565,31 +144905,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124601,15 +144943,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124617,31 +144959,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124654,11 +144992,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124666,8 +145004,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124682,14 +145020,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124709,8 +145046,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124728,8 +145065,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124737,22 +145074,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124766,15 +145103,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124782,31 +145119,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124819,11 +145152,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124831,10 +145164,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124849,14 +145180,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124876,8 +145206,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124895,8 +145225,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 924 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124904,22 +145234,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124931,15 +145263,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124947,37 +145279,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -124985,10 +145313,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124996,13 +145324,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -125012,14 +145340,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125039,8 +145366,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125058,31 +145385,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 925 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -125102,58 +145429,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125161,15 +145484,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -125179,14 +145500,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125206,8 +145526,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125225,31 +145545,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125269,56 +145591,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125326,12 +145648,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125349,7 +145671,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125369,8 +145690,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125388,28 +145709,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 927 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -125432,58 +145753,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125491,14 +145808,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125509,14 +145824,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125536,8 +145850,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125555,31 +145869,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 928 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125597,41 +145913,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -125645,10 +145957,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125656,12 +145968,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125672,14 +145984,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125699,8 +146010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125718,8 +146029,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 929 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125727,18 +146038,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -125756,49 +146067,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125809,10 +146116,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125821,11 +146128,9 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -125839,14 +146144,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125866,8 +146170,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125885,8 +146189,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 930 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125894,22 +146198,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125928,38 +146234,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125971,9 +146277,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125982,14 +146288,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126007,7 +146311,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126027,8 +146330,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126046,8 +146349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 931 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126056,21 +146359,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126089,53 +146394,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126143,14 +146448,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126162,13 +146465,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126188,8 +146490,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126207,31 +146509,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 932 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126243,13 +146547,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -126263,33 +146567,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126297,10 +146597,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126308,11 +146608,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -126324,14 +146624,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126351,8 +146650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126370,32 +146669,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 933 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -126408,14 +146707,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -126434,38 +146733,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126473,14 +146772,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126492,13 +146789,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126518,8 +146814,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126537,31 +146833,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 934 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126573,14 +146871,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -126593,44 +146891,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126638,15 +146932,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -126656,14 +146948,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126683,8 +146974,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126702,31 +146993,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 935 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126738,7 +147031,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126758,44 +147051,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126803,13 +147096,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -126820,13 +147113,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126846,8 +147138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126865,31 +147157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 936 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -126903,15 +147195,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -126919,37 +147211,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126957,10 +147245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126968,15 +147256,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -126986,14 +147272,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127013,8 +147298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127032,31 +147317,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 937 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127068,7 +147355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -127076,7 +147363,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127084,37 +147371,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127122,10 +147409,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127133,13 +147420,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127156,7 +147443,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127176,8 +147462,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127195,31 +147481,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 938 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127233,15 +147519,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127249,33 +147535,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1056 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127286,11 +147568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127298,10 +147580,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -127316,14 +147596,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127343,8 +147622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127362,8 +147641,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 939 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127371,22 +147650,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127398,15 +147679,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127414,33 +147695,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127451,10 +147728,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127463,12 +147740,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -127479,14 +147756,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127506,8 +147782,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127525,8 +147801,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 940 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127534,22 +147810,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127570,38 +147846,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127613,9 +147889,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127624,14 +147900,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -127649,7 +147923,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127669,8 +147942,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127688,8 +147961,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 941 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127698,21 +147971,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127730,58 +148005,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 544 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127789,15 +148060,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127807,14 +148076,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127834,8 +148102,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127853,31 +148121,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 942 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127895,58 +148165,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127954,11 +148220,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -127970,14 +148236,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127997,8 +148262,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128016,28 +148281,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 943 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -128060,54 +148325,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128115,10 +148384,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -128133,14 +148400,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128160,8 +148426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128179,31 +148445,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 944 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128215,49 +148483,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -128268,11 +148536,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128280,14 +148548,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -128305,7 +148571,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128325,8 +148590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128344,8 +148609,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 945 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128354,21 +148619,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128380,7 +148647,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128388,56 +148655,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128445,11 +148712,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -128462,13 +148729,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128488,8 +148754,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128507,31 +148773,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 946 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -128545,49 +148811,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1088 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128595,10 +148861,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128606,15 +148872,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128631,7 +148895,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128651,8 +148914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128670,31 +148933,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128706,49 +148971,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128756,9 +149025,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -128767,15 +149036,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128785,14 +149052,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128812,8 +149078,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128831,31 +149097,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 948 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128867,49 +149135,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128917,9 +149189,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -128928,15 +149200,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128946,14 +149216,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128973,8 +149242,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128992,31 +149261,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 826 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 949 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129028,59 +149299,59 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -129089,15 +149360,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129114,7 +149383,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129134,8 +149402,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129153,14 +149421,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 827 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 950 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -129169,15 +149437,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129189,59 +149459,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -129250,15 +149524,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129268,14 +149540,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129295,8 +149566,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129314,31 +149585,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 828 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 951 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129350,15 +149623,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129366,37 +149639,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 3648 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129404,9 +149677,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -129415,15 +149688,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129434,13 +149705,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129460,8 +149730,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129479,31 +149749,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 829 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 952 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129515,15 +149787,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129531,37 +149803,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129569,9 +149837,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -129580,13 +149848,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129596,14 +149864,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129623,8 +149890,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129642,31 +149909,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 830 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 953 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -129680,53 +149947,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3648 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129734,10 +150001,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129745,15 +150012,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129770,7 +150035,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129790,8 +150054,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129809,31 +150073,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 831 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 954 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129845,7 +150111,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -129853,7 +150119,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129861,37 +150127,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 1856 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129899,10 +150165,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129910,11 +150176,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -129933,7 +150199,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129953,8 +150218,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129972,31 +150237,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 832 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 955 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -130010,45 +150275,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130059,7 +150328,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -130071,15 +150340,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130089,14 +150356,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130116,8 +150382,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130135,31 +150401,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 833 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 956 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130171,7 +150439,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -130179,52 +150447,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130232,13 +150500,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130255,7 +150523,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130275,8 +150542,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130294,31 +150561,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 834 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 957 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -130332,49 +150599,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130382,10 +150653,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130393,14 +150664,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130411,14 +150680,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130438,8 +150706,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130457,31 +150725,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 835 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 958 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130493,49 +150763,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130543,10 +150813,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130554,14 +150824,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130579,7 +150847,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130599,8 +150866,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130618,31 +150885,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 836 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 959 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130654,53 +150923,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3080 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130708,10 +150977,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130719,14 +150988,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130744,7 +151011,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130764,8 +151030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130783,31 +151049,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 837 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 960 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130819,49 +151087,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130869,9 +151141,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -130880,15 +151152,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130898,14 +151168,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130925,8 +151194,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130944,31 +151213,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 838 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 961 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130980,49 +151251,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131030,10 +151305,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131041,15 +151316,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131059,14 +151332,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131086,8 +151358,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131105,31 +151377,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 839 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 962 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131141,49 +151415,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 832 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131191,10 +151465,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131202,13 +151476,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -131227,7 +151499,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131247,8 +151518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131266,31 +151537,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 840 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 963 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131302,53 +151575,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3080 + LdsNumElements: 1856 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131356,10 +151629,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131367,15 +151640,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131392,7 +151663,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131412,8 +151682,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131431,31 +151701,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 841 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 964 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131467,60 +151739,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131528,13 +151804,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -131546,14 +151820,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131573,8 +151846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131592,31 +151865,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 842 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 965 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131628,49 +151903,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131678,9 +151957,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131689,15 +151968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131707,14 +151984,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131734,8 +152010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131753,31 +152029,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 843 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 966 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131789,60 +152067,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131850,14 +152132,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -131868,14 +152148,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131895,8 +152174,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131914,31 +152193,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 844 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 967 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131950,59 +152231,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -132011,13 +152296,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -132029,14 +152312,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132056,8 +152338,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132075,31 +152357,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 845 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 968 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132111,60 +152395,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132172,15 +152460,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132190,14 +152476,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132217,8 +152502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132236,31 +152521,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 846 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 969 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132272,60 +152559,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132333,14 +152624,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -132351,14 +152640,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132378,8 +152666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132397,31 +152685,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 847 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 970 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132433,59 +152723,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -132494,13 +152788,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -132512,14 +152804,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132539,8 +152830,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132558,31 +152849,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 848 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 971 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132594,59 +152887,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -132655,15 +152952,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132673,14 +152968,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132700,8 +152994,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132719,31 +153013,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 849 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 972 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132755,60 +153051,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132816,14 +153112,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -132841,7 +153135,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132861,8 +153154,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132880,31 +153173,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 850 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 973 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132916,60 +153211,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132977,14 +153276,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -132995,14 +153292,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133022,8 +153318,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133041,31 +153337,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 851 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 974 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133077,59 +153375,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133138,15 +153440,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133156,14 +153456,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133183,8 +153482,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133202,14 +153501,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 852 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 975 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -133218,15 +153517,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133244,54 +153545,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3104 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133299,14 +153604,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -133317,14 +153620,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133344,8 +153646,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133363,15 +153665,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 853 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 976 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -133379,15 +153681,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133399,49 +153703,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -133449,9 +153757,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133460,15 +153768,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133478,14 +153784,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133505,8 +153810,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133524,31 +153829,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 854 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 977 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133560,49 +153867,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -133610,10 +153921,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133621,14 +153932,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -133639,14 +153948,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133666,8 +153974,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133685,31 +153993,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 855 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 978 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133721,60 +154031,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133782,14 +154096,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -133800,14 +154112,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133827,8 +154138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133846,31 +154157,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 856 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 979 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133882,15 +154195,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -133898,29 +154211,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -133931,10 +154244,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133943,14 +154256,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -133962,13 +154273,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133988,8 +154298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134007,31 +154317,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 857 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 980 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -134043,7 +154355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -134051,7 +154363,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134059,29 +154371,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -134092,11 +154404,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134104,12 +154416,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -134127,7 +154439,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -134147,8 +154458,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134166,31 +154477,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 858 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 981 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134204,7 +154515,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -134224,39 +154535,39 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 8 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -134265,8 +154576,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134282,7 +154593,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -134307,8 +154618,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134326,14 +154637,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 859 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 982 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -134347,10 +154658,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134364,7 +154675,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -134389,14 +154700,14 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 @@ -134413,11 +154724,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134442,7 +154753,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -134467,8 +154778,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134486,15 +154797,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 860 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 983 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -134507,10 +154818,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134544,40 +154855,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 4 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134585,12 +154896,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -134627,8 +154938,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134646,15 +154957,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 861 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 984 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -134667,7 +154978,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -134684,7 +154995,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -134709,35 +155020,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134745,8 +155056,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134787,8 +155098,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134806,31 +155117,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 862 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 985 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134878,26 +155189,26 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 800 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134905,12 +155216,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -134947,8 +155258,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134966,15 +155277,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 863 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 986 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -134987,7 +155298,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -135024,40 +155335,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 4 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135065,13 +155376,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135082,7 +155393,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -135107,8 +155418,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135126,15 +155437,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 864 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 987 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -135147,7 +155458,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -135170,7 +155481,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -135189,24 +155500,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 4 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -135218,10 +155525,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135245,7 +155552,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -135271,8 +155578,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135290,16 +155597,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 865 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 988 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -135311,7 +155618,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -135353,20 +155660,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -135378,10 +155685,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135394,7 +155701,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -135406,7 +155713,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -135431,8 +155738,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135450,15 +155757,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 866 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 989 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -135471,7 +155778,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -135508,40 +155815,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 4 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135549,12 +155856,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -135591,8 +155898,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135610,15 +155917,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 867 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 990 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -135631,8 +155938,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -135668,39 +155975,39 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 4 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -135709,8 +156016,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -135726,7 +156033,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -135751,8 +156058,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135770,29 +156077,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 868 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 991 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -135814,7 +156121,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -135828,39 +156135,43 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 4 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -135869,8 +156180,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -135885,7 +156196,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -135911,8 +156222,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135930,29 +156241,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 869 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 992 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -135974,7 +156285,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -135993,20 +156304,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 4 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 2 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136018,10 +156333,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136034,7 +156349,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136045,8 +156360,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -136071,8 +156386,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136090,16 +156405,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 870 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 993 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -136111,7 +156426,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [2, 32, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -136153,20 +156468,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 32 LSPA: 32 LSPB: 16 - LVCA: 8 + LVCA: 4 LVCB: 16 LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1344 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136178,10 +156493,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136194,7 +156509,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136206,7 +156521,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -136231,8 +156546,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136250,15 +156565,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 871 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 994 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -136271,7 +156586,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -136313,24 +156628,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136342,10 +156657,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136357,8 +156672,8 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136395,8 +156710,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136414,15 +156729,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 872 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 995 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 @@ -136435,7 +156750,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -136458,7 +156773,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -136479,33 +156794,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 8 + LSPA: 32 + LSPB: 16 LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136517,9 +156836,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136529,8 +156848,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -136555,8 +156874,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136574,20 +156893,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 873 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 996 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -136595,7 +156914,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -136639,37 +156958,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 8 + LSPA: 32 + LSPB: 16 LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136681,9 +157000,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136719,8 +157038,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136738,20 +157057,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 874 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 997 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -136759,7 +157078,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -136801,35 +157120,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136841,9 +157160,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136879,8 +157198,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136898,29 +157217,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 875 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 998 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -136953,7 +157272,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -136961,39 +157280,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137005,9 +157324,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -137043,8 +157362,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137062,28 +157381,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 876 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 999 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -137100,7 +157419,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137120,40 +157439,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 4 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1056 + LdsNumElements: 2240 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137161,12 +157480,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137203,8 +157522,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137222,15 +157541,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 877 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -137243,10 +157562,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137260,13 +157579,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137285,35 +157604,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137321,8 +157644,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -137337,8 +157660,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -137363,8 +157686,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137382,31 +157705,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 878 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137420,13 +157743,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137440,40 +157763,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137481,12 +157808,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137497,7 +157824,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -137523,8 +157850,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137542,31 +157869,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 879 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137580,7 +157907,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137600,21 +157927,21 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 544 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 @@ -137622,18 +157949,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137641,13 +157968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -137683,8 +158010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137702,31 +158029,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 880 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137740,13 +158067,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137760,29 +158087,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137790,10 +158121,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137801,12 +158132,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137817,7 +158148,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -137843,8 +158174,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137862,31 +158193,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 881 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137900,7 +158231,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137920,33 +158251,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137954,10 +158285,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137965,12 +158296,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138007,8 +158338,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138026,15 +158357,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 882 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 @@ -138047,10 +158378,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [2, 16, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -138084,7 +158415,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -138092,36 +158423,36 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138129,12 +158460,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138171,8 +158502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138190,15 +158521,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 883 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 @@ -138211,7 +158542,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138253,22 +158584,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -138282,10 +158613,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138297,7 +158628,7 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138310,7 +158641,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -138335,8 +158666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138354,15 +158685,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 884 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 @@ -138375,7 +158706,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138398,7 +158729,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -138412,23 +158743,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -138442,9 +158777,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -138453,11 +158788,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138469,7 +158804,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -138495,8 +158830,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138514,28 +158849,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 885 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138572,43 +158907,43 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -138617,11 +158952,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138659,8 +158994,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138678,14 +159013,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 886 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [2, 2] @@ -138699,7 +159034,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138750,30 +159085,30 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138781,12 +159116,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138798,7 +159133,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -138823,8 +159158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138842,28 +159177,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 887 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138886,7 +159221,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -138900,7 +159235,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -138908,32 +159243,36 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138941,7 +159280,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -138957,7 +159296,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -138983,8 +159322,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139002,16 +159341,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 888 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 @@ -139023,7 +159362,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -139060,43 +159399,43 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139105,11 +159444,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -139147,8 +159486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139166,28 +159505,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 889 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -139224,27 +159563,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -139258,10 +159597,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139269,12 +159608,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -139286,7 +159625,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -139311,8 +159650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139330,29 +159669,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 890 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139374,7 +159713,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -139393,18 +159732,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -139418,10 +159761,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139433,8 +159776,8 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -139445,7 +159788,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -139471,8 +159814,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139490,16 +159833,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 891 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [2, 4] ThreadTile0: 2 ThreadTile1: 4 @@ -139511,8 +159854,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139553,39 +159896,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 16 - LVPA: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139593,12 +159936,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -139635,8 +159978,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139654,29 +159997,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 892 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139719,37 +160062,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 8 + LSPA: 32 + LSPB: 16 LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139761,9 +160104,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139799,8 +160142,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139818,8 +160161,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 893 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -139828,19 +160171,19 @@ SubGroupA: 8 SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139883,12 +160226,12 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 32 LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 @@ -139902,14 +160245,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -139921,13 +160264,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139963,8 +160306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139982,28 +160325,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 894 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -140026,7 +160369,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -140037,7 +160380,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -140045,18 +160388,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -140070,10 +160417,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140085,7 +160432,7 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -140097,8 +160444,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -140123,8 +160470,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140142,15 +160489,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 895 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -140163,8 +160510,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140197,7 +160544,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -140226,14 +160573,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -140262,7 +160609,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -140287,8 +160634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140306,29 +160653,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 896 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140361,42 +160708,42 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -140405,11 +160752,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -140447,8 +160794,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140466,29 +160813,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 897 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140521,46 +160868,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -140569,11 +160916,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -140611,8 +160958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140630,29 +160977,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 898 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140685,10 +161032,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -140702,9 +161049,9 @@ LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -140714,18 +161061,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140733,12 +161080,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -140750,7 +161097,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -140775,8 +161122,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140794,16 +161141,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 899 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 4 + SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 ThreadTile1: 4 @@ -140815,7 +161162,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -140832,7 +161179,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -140857,28 +161204,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140886,10 +161233,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140897,12 +161244,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -140939,8 +161286,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140958,20 +161305,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 900 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -140979,10 +161326,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140996,13 +161343,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -141016,29 +161363,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 832 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141046,10 +161397,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141057,13 +161408,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141073,7 +161424,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -141099,8 +161450,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141118,31 +161469,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 901 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141156,13 +161507,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -141176,33 +161527,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141210,10 +161557,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141221,13 +161568,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141237,8 +161584,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -141263,8 +161610,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141282,31 +161629,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 902 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141320,13 +161667,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -141337,7 +161684,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -141345,28 +161692,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141374,10 +161717,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141385,13 +161728,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141401,8 +161744,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -141427,8 +161770,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141446,31 +161789,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 903 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141484,7 +161827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -141492,56 +161835,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141549,11 +161892,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -141566,7 +161909,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -141591,8 +161934,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141610,31 +161953,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 904 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141648,7 +161991,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -141656,56 +161999,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141713,8 +162056,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -141755,8 +162098,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141774,31 +162117,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 905 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141820,56 +162163,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141877,11 +162220,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -141919,8 +162262,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141938,29 +162281,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 906 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -141976,7 +162319,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -141993,36 +162336,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 + LdsNumElements: 3088 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142030,10 +162373,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142041,13 +162384,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142058,7 +162401,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -142083,8 +162426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142102,31 +162445,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 907 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142140,13 +162483,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -142157,36 +162500,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142194,10 +162533,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142205,13 +162544,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142221,7 +162560,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -142247,8 +162586,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142266,31 +162605,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 908 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142304,7 +162643,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142321,7 +162660,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -142329,16 +162668,16 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3088 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -142350,7 +162689,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142358,10 +162697,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142369,13 +162708,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142411,8 +162750,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142430,20 +162769,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 909 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -142451,10 +162790,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142468,7 +162807,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142485,30 +162824,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -142521,11 +162860,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142533,13 +162872,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142550,7 +162889,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -142575,8 +162914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142594,31 +162933,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 910 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142632,7 +162971,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142649,43 +162988,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142693,12 +163032,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -142735,8 +163074,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142754,31 +163093,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142792,7 +163131,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142809,30 +163148,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -142845,11 +163184,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142857,13 +163196,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142899,8 +163238,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142918,31 +163257,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142956,13 +163295,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -142973,7 +163312,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -142982,21 +163321,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -143009,11 +163344,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143021,13 +163356,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143037,7 +163372,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -143063,8 +163398,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143082,31 +163417,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143120,7 +163455,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -143137,7 +163472,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -143146,21 +163481,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -143173,11 +163508,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143185,13 +163520,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143227,8 +163562,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143246,31 +163581,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143284,13 +163619,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143301,7 +163636,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -143309,39 +163644,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143349,11 +163680,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -143365,8 +163696,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -143391,8 +163722,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143410,20 +163741,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1038 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -143431,10 +163762,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143448,13 +163779,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143465,47 +163796,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143513,13 +163840,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143529,8 +163856,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -143555,8 +163882,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143574,31 +163901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1039 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143629,30 +163956,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -143666,10 +163993,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143677,13 +164004,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143719,8 +164046,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143738,29 +164065,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1040 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -143776,13 +164103,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143801,34 +164128,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSPB: 32 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -143841,9 +164172,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143853,8 +164184,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -143879,8 +164210,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143898,31 +164229,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1041 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143936,13 +164267,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143961,24 +164292,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSPB: 64 + LVCA: 64 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -143986,9 +164321,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144001,9 +164336,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144013,7 +164348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144039,8 +164374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144058,20 +164393,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 + SolutionIndex: 1042 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -144079,10 +164414,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144096,13 +164431,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144116,25 +164451,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144145,11 +164484,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144157,13 +164496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144173,8 +164512,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144199,8 +164538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144218,31 +164557,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1043 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144256,13 +164595,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144276,25 +164615,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 4 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144305,10 +164648,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144317,13 +164660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144333,7 +164676,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144359,8 +164702,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144378,31 +164721,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1044 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144422,53 +164765,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144477,12 +164824,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -144493,8 +164840,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144519,8 +164866,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144538,28 +164885,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1045 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -144576,59 +164923,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 128 + LSCB: 16 LSPA: 8 LSPB: 64 - LVCA: 4 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 32 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 8 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144637,11 +164988,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -144653,7 +165004,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144679,8 +165030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144698,31 +165049,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1046 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144744,37 +165095,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144786,10 +165137,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144797,13 +165148,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144839,8 +165190,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144858,29 +165209,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1047 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -144902,54 +165253,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144957,8 +165312,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -144973,8 +165328,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144999,8 +165354,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145018,29 +165373,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1048 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -145062,54 +165417,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145117,12 +165476,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145133,7 +165492,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -145159,8 +165518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145178,29 +165537,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1049 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -145222,53 +165581,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 8 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145277,12 +165640,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145293,8 +165656,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145319,8 +165682,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145338,29 +165701,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1050 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -145376,59 +165739,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 2 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145437,11 +165804,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -145453,8 +165820,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145479,8 +165846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145498,31 +165865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1051 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145536,15 +165903,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -145552,29 +165919,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145585,11 +165956,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145597,13 +165968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145613,7 +165984,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -145639,8 +166010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145658,31 +166029,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1052 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145696,7 +166067,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145704,41 +166075,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145749,10 +166120,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145761,13 +166132,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145803,8 +166174,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145822,31 +166193,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1053 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145860,7 +166231,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145868,7 +166239,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -145876,33 +166247,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145913,11 +166284,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145925,13 +166296,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145967,8 +166338,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145986,31 +166357,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 + SolutionIndex: 1054 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146030,39 +166401,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1344 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -146074,10 +166449,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146085,13 +166460,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146101,7 +166476,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -146127,8 +166502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146146,28 +166521,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1055 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -146192,7 +166567,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -146200,48 +166575,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146249,13 +166624,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146291,8 +166666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146310,28 +166685,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1056 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 32 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -146356,7 +166731,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -146364,48 +166739,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146413,13 +166788,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146430,7 +166805,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146455,8 +166830,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146474,29 +166849,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1057 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -146520,56 +166895,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146577,13 +166952,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146619,8 +166994,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146638,29 +167013,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1058 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -146682,9 +167057,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -146692,44 +167067,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 4 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146737,13 +167116,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146753,7 +167132,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -146779,8 +167158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146798,29 +167177,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1059 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -146844,56 +167223,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146901,11 +167280,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -146943,8 +167322,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146962,29 +167341,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1060 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147006,9 +167385,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -147016,29 +167395,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 2 + LSPB: 32 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2240 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -147050,10 +167433,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147061,13 +167444,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147077,8 +167460,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -147103,8 +167486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147122,29 +167505,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1061 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147168,41 +167551,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -147214,10 +167597,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147225,13 +167608,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147267,8 +167650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147286,29 +167669,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 + SolutionIndex: 1062 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147332,56 +167715,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147389,13 +167772,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147431,8 +167814,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147450,29 +167833,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 + SolutionIndex: 1063 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147494,9 +167877,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -147504,44 +167887,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 4 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147549,13 +167936,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147565,7 +167952,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147591,8 +167978,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147610,29 +167997,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1064 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147648,13 +168035,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147668,44 +168055,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147713,12 +168096,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147729,8 +168112,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -147755,8 +168138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147774,20 +168157,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1065 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 4 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -147795,10 +168178,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147812,13 +168195,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147832,44 +168215,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147877,12 +168256,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147893,7 +168272,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147919,8 +168298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147938,31 +168317,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 + SolutionIndex: 1066 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 16, 8] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147984,56 +168363,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148041,12 +168420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -148083,8 +168462,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148102,29 +168481,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1067 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -148140,13 +168519,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148160,44 +168539,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148205,11 +168580,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148221,7 +168596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148247,8 +168622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148266,31 +168641,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1068 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148304,13 +168679,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148330,38 +168705,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148369,11 +168740,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148385,7 +168756,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148411,8 +168782,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148430,20 +168801,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 + SolutionIndex: 1069 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -148451,10 +168822,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148468,7 +168839,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148494,21 +168865,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -148521,11 +168892,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148533,11 +168904,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148575,8 +168946,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148594,31 +168965,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 + SolutionIndex: 1070 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148632,13 +169003,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148658,21 +169029,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -148685,11 +169052,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148697,11 +169064,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -148713,7 +169080,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148739,8 +169106,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148758,20 +169125,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1071 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -148779,10 +169146,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148796,7 +169163,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148816,44 +169183,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148861,12 +169228,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -148903,8 +169270,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148922,31 +169289,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 + SolutionIndex: 1072 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148960,13 +169327,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148980,33 +169347,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149015,9 +169378,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149025,12 +169388,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149041,7 +169404,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -149067,8 +169430,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149086,20 +169449,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1073 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149107,10 +169470,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149124,13 +169487,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -149144,33 +169507,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149179,9 +169538,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149189,12 +169548,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149205,7 +169564,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -149231,8 +169590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149250,31 +169609,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 + SolutionIndex: 1074 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149288,7 +169647,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149314,27 +169673,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149343,9 +169702,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149353,12 +169712,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149395,8 +169754,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149414,20 +169773,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 + SolutionIndex: 1075 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149435,10 +169794,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149452,7 +169811,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149477,22 +169836,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -149505,11 +169864,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149517,12 +169876,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149559,8 +169918,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149578,31 +169937,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 + SolutionIndex: 1076 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149636,7 +169995,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -149650,9 +170009,9 @@ LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -149671,9 +170030,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149681,12 +170040,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149723,8 +170082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149742,20 +170101,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1077 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149763,7 +170122,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -149780,7 +170139,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149788,41 +170147,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -149833,11 +170192,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149845,13 +170204,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -149887,8 +170246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149906,31 +170265,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 + SolutionIndex: 1078 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149944,7 +170303,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149952,45 +170311,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149998,10 +170357,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150009,11 +170368,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -150026,7 +170385,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -150051,8 +170410,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150070,31 +170429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1079 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150108,7 +170467,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150116,56 +170475,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150173,8 +170532,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150190,7 +170549,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -150215,8 +170574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150234,31 +170593,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1080 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150278,54 +170637,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150333,12 +170696,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -150349,7 +170712,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -150375,8 +170738,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150394,28 +170757,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1081 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -150440,56 +170803,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150497,12 +170860,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -150539,8 +170902,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150558,29 +170921,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1082 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -150604,56 +170967,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150661,7 +171024,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -150703,8 +171066,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150722,29 +171085,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1083 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -150760,7 +171123,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150785,39 +171148,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150825,8 +171188,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150867,8 +171230,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150886,15 +171249,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1084 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -150907,10 +171270,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150924,13 +171287,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -150949,39 +171312,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150989,8 +171348,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -151005,7 +171364,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -151031,8 +171390,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151050,16 +171409,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1085 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -151071,10 +171430,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151088,13 +171447,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151113,35 +171472,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151149,8 +171512,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -151165,8 +171528,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -151191,8 +171554,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151210,16 +171573,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1086 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -151231,10 +171594,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151248,13 +171611,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151273,35 +171636,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151309,8 +171676,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -151325,8 +171692,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -151351,8 +171718,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151370,16 +171737,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1087 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -151391,10 +171758,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151408,7 +171775,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151416,56 +171783,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151473,12 +171840,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151515,8 +171882,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151534,31 +171901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1088 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151572,7 +171939,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151580,56 +171947,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151637,12 +172004,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151679,8 +172046,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151698,15 +172065,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1089 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -151718,11 +172085,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151736,7 +172103,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151745,7 +172112,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -151762,38 +172129,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151801,12 +172168,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151843,8 +172210,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151862,15 +172229,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1090 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -151883,10 +172250,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151900,7 +172267,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151908,7 +172275,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -151916,37 +172283,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -151955,9 +172322,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151965,8 +172332,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152007,8 +172374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152026,15 +172393,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1091 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -152046,11 +172413,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152064,15 +172431,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -152080,33 +172447,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -152115,9 +172486,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152125,8 +172496,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152141,7 +172512,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -152167,8 +172538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152186,16 +172557,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1092 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -152206,11 +172577,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152224,7 +172595,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152232,45 +172603,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -152290,7 +172661,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152331,8 +172702,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152350,31 +172721,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1093 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152388,7 +172759,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152396,52 +172767,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -152453,13 +172824,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152495,8 +172866,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152514,8 +172885,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1094 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -152534,11 +172905,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152552,15 +172923,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -152568,44 +172939,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 128 + LSCB: 32 + LSPA: 16 LSPB: 32 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152613,13 +172988,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152629,7 +173004,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -152655,8 +173030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152674,31 +173049,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1095 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152712,7 +173087,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152720,41 +173095,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6272 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -152765,10 +173140,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -152778,12 +173153,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152819,8 +173194,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152838,14 +173213,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1096 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -152858,11 +173233,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152876,60 +173251,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152937,13 +173316,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152953,8 +173332,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -152979,8 +173358,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152998,16 +173377,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1097 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -153018,11 +173397,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153036,49 +173415,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -153089,11 +173464,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153102,12 +173477,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153117,7 +173492,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -153143,8 +173518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153162,16 +173537,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1098 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -153182,11 +173557,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 4] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153200,60 +173575,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153261,12 +173640,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153277,8 +173656,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -153303,8 +173682,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153322,16 +173701,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1099 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -153342,11 +173721,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153360,60 +173739,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153421,12 +173804,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153437,7 +173820,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -153463,8 +173846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153482,31 +173865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153528,56 +173911,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 16 - LVPB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153585,13 +173968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153627,8 +174010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153646,15 +174029,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 + SolutionIndex: 1101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -153666,9 +174049,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -153692,41 +174075,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 32 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -153738,9 +174121,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -153750,12 +174133,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153791,8 +174174,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153810,14 +174193,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -153830,9 +174213,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -153854,7 +174237,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -153865,7 +174248,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -153873,22 +174256,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -153902,10 +174281,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153917,9 +174296,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153929,7 +174308,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -153955,8 +174334,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153974,16 +174353,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 + SolutionIndex: 1103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -153995,7 +174374,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -154018,7 +174397,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -154029,7 +174408,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -154039,37 +174418,33 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 8 + LSPB: 16 LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154077,13 +174452,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154093,7 +174468,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -154119,8 +174494,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154138,16 +174513,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 + SolutionIndex: 1104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -154159,7 +174534,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -154193,7 +174568,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -154201,22 +174576,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 32 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -154230,9 +174605,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -154246,8 +174621,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154283,8 +174658,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154302,16 +174677,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -154323,7 +174698,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -154340,7 +174715,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -154348,31 +174723,31 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 @@ -154382,22 +174757,22 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154405,12 +174780,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -154447,8 +174822,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154466,16 +174841,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -154486,11 +174861,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154504,64 +174879,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 16 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 8 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154569,11 +174940,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -154585,8 +174956,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -154611,8 +174982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154630,31 +175001,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154668,7 +175039,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -154676,37 +175047,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154717,11 +175088,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154729,8 +175100,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -154746,7 +175117,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -154771,8 +175142,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154790,8 +175161,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -154800,21 +175171,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154834,43 +175205,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154882,10 +175249,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154893,8 +175260,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -154909,8 +175276,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -154935,8 +175302,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154954,8 +175321,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -154963,18 +175330,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B @@ -154998,43 +175365,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155046,10 +175409,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155057,12 +175420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -155073,7 +175436,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -155099,8 +175462,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155118,8 +175481,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 + SolutionIndex: 1110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -155127,18 +175490,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B @@ -155156,49 +175519,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 528 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155209,11 +175568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155221,11 +175580,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -155237,7 +175596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -155263,8 +175622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155282,8 +175641,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -155291,22 +175650,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155320,64 +175679,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155385,12 +175740,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -155401,8 +175756,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -155427,8 +175782,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155446,31 +175801,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155484,15 +175839,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155500,37 +175855,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155538,9 +175889,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155549,13 +175900,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155565,7 +175916,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -155591,8 +175942,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155610,31 +175961,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155648,7 +175999,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -155656,45 +176007,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 64 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155702,10 +176053,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155713,13 +176064,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155755,8 +176106,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155774,31 +176125,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 + SolutionIndex: 1114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155812,15 +176163,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155828,48 +176179,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155878,12 +176225,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155893,8 +176240,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -155919,8 +176266,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155938,31 +176285,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155982,43 +176329,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156030,10 +176373,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156041,13 +176384,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156057,7 +176400,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156083,8 +176426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156102,29 +176445,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 + SolutionIndex: 1116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156148,7 +176491,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -156156,48 +176499,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 64 - LVCA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156205,13 +176548,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156247,8 +176590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156266,29 +176609,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 + SolutionIndex: 1117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156312,7 +176655,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -156320,33 +176663,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156358,10 +176701,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156369,13 +176712,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156411,8 +176754,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156430,29 +176773,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 + SolutionIndex: 1118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156476,41 +176819,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156522,10 +176865,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156533,13 +176876,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156575,8 +176918,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156594,29 +176937,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 + SolutionIndex: 1119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156640,7 +176983,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -156648,48 +176991,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156697,13 +177040,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156739,8 +177082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156758,29 +177101,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 + SolutionIndex: 1120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156805,7 +177148,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -156824,36 +177167,36 @@ LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12416 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156861,12 +177204,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -156903,8 +177246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156922,15 +177265,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -156943,8 +177286,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156985,38 +177328,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157025,11 +177368,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 512 PackBatchDims: 0 @@ -157042,7 +177385,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -157067,8 +177410,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157086,14 +177429,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 + SolutionIndex: 1122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -157107,8 +177450,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -157133,7 +177476,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -157151,20 +177494,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -157179,9 +177522,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157189,13 +177532,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157231,8 +177574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157250,29 +177593,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 + SolutionIndex: 1123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -157297,7 +177640,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -157315,20 +177658,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -157343,9 +177686,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157353,13 +177696,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157395,8 +177738,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157414,20 +177757,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 + SolutionIndex: 1124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -157435,7 +177778,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -157479,16 +177822,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 32 LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 13440 LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 @@ -157507,9 +177850,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157521,9 +177864,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157559,8 +177902,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157578,15 +177921,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 + SolutionIndex: 1125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 32 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 32 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -157599,7 +177942,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] + WorkGroup: [32, 8, 2] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -157608,7 +177951,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157616,59 +177959,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 8 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157677,8 +178025,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157693,13 +178043,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157719,8 +178071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157738,14 +178090,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -157758,17 +178110,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157776,59 +178126,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 8 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157837,8 +178192,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157853,13 +178208,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157879,8 +178236,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157898,14 +178255,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -157918,17 +178275,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157936,7 +178293,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157944,55 +178301,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158001,12 +178359,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158017,13 +178375,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158043,8 +178403,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158062,37 +178422,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158106,7 +178466,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -158124,35 +178484,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 96 LSCB: 8 - LSPA: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158161,11 +178526,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158177,13 +178542,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158203,8 +178570,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158222,20 +178589,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158243,16 +178610,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158266,7 +178633,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -158284,35 +178651,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 96 LSCB: 8 - LSPA: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158321,11 +178693,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158337,13 +178709,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158363,8 +178737,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158382,20 +178756,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158403,16 +178777,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158444,39 +178818,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 96 LSCB: 8 - LSPA: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158485,11 +178860,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158501,6 +178876,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -158508,6 +178884,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158527,8 +178904,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158546,20 +178923,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158567,16 +178944,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158590,8 +178967,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -158608,36 +178985,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158645,12 +179027,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158661,13 +179045,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158687,8 +179073,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158706,37 +179092,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158744,7 +179128,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158753,14 +179137,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -158768,29 +179152,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -158798,10 +179183,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158809,12 +179194,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158825,6 +179210,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -158832,6 +179218,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158851,8 +179238,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158870,37 +179257,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158908,59 +179295,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 8 + LVCA: 128 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158969,11 +179361,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -158985,13 +179379,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159011,8 +179407,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159030,37 +179426,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159068,14 +179462,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -159092,25 +179486,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159118,9 +179517,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159129,12 +179528,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159145,13 +179546,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159171,8 +179574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159190,20 +179593,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159211,16 +179614,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159228,14 +179629,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -159252,29 +179653,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159282,9 +179684,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159293,12 +179695,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159309,6 +179713,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159316,6 +179721,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159335,8 +179741,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159354,20 +179760,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159375,16 +179781,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159392,15 +179796,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -159408,37 +179812,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159446,9 +179851,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159457,12 +179862,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159473,13 +179880,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159499,8 +179908,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159518,37 +179927,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159556,15 +179963,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -159572,48 +179979,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159621,12 +180029,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159637,6 +180047,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159644,6 +180055,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159663,8 +180075,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159682,37 +180094,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159720,16 +180130,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -159744,29 +180154,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159774,9 +180185,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159785,8 +180196,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -159801,6 +180214,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159808,6 +180222,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159827,8 +180242,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159846,16 +180261,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -159867,16 +180282,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159891,7 +180304,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -159908,39 +180321,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159949,11 +180363,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -159965,6 +180381,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159972,6 +180389,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159991,8 +180409,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160010,16 +180428,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -160031,16 +180449,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160048,7 +180464,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160056,45 +180472,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -160102,9 +180519,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -160113,11 +180530,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -160129,13 +180546,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160155,8 +180574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160174,16 +180593,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -160194,17 +180613,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160212,7 +180631,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160220,55 +180639,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -160277,12 +180697,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -160293,6 +180713,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -160300,6 +180721,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160319,8 +180741,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160338,37 +180760,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160376,7 +180798,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160385,7 +180807,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -160400,39 +180822,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -160441,12 +180864,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -160457,13 +180880,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160483,8 +180908,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160502,20 +180927,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -160523,16 +180948,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160540,7 +180965,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160549,7 +180974,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -160564,19 +180989,20 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -160586,18 +181012,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160605,8 +181031,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160621,13 +181047,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160647,8 +181075,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160666,20 +181094,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -160687,16 +181115,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160704,7 +181132,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160712,7 +181140,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160720,48 +181148,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160769,8 +181198,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160785,6 +181214,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -160792,6 +181222,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160811,8 +181242,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160830,37 +181261,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160868,15 +181299,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160884,44 +181315,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160929,8 +181365,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160945,13 +181381,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160971,8 +181409,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160990,37 +181428,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161036,56 +181474,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161093,11 +181532,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -161109,6 +181548,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161116,6 +181556,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161135,8 +181576,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161154,29 +181595,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -161184,7 +181625,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161192,53 +181633,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161246,10 +181688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161257,12 +181699,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -161273,6 +181717,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161280,6 +181725,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161299,8 +181745,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161318,37 +181764,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161356,14 +181800,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -161380,29 +181824,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161410,10 +181855,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161421,8 +181866,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -161437,13 +181884,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161463,8 +181912,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161482,37 +181931,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161520,14 +181967,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -161544,29 +181991,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161574,10 +182022,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161585,8 +182033,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -161601,6 +182051,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161608,6 +182059,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161627,8 +182079,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161646,37 +182098,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161684,16 +182134,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -161708,29 +182158,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161738,10 +182189,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161749,8 +182200,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -161765,13 +182218,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161791,8 +182246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161810,37 +182265,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161848,16 +182301,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -161872,29 +182325,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161902,10 +182356,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161913,8 +182367,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -161929,13 +182385,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161955,8 +182413,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161974,37 +182432,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162012,16 +182468,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -162036,29 +182492,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162066,10 +182523,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162077,8 +182534,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -162093,6 +182552,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162100,6 +182560,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162119,8 +182580,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162138,37 +182599,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162176,14 +182635,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -162200,29 +182659,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162230,10 +182690,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162241,8 +182701,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -162257,6 +182719,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162264,6 +182727,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162283,8 +182747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162302,20 +182766,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 1154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -162323,16 +182787,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162340,14 +182802,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -162364,29 +182826,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162394,10 +182857,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162405,12 +182868,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162421,6 +182886,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162428,6 +182894,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162447,8 +182914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162466,37 +182933,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162504,7 +182969,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162513,7 +182978,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -162528,29 +182993,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 128 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162559,9 +183025,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162569,13 +183035,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162585,13 +183051,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162611,8 +183079,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162630,20 +183098,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + SolutionIndex: 1156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -162651,16 +183119,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162668,7 +183136,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162692,29 +183160,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162722,10 +183191,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162733,8 +183202,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -162749,6 +183218,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162756,6 +183226,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162775,8 +183246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162794,37 +183265,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 + SolutionIndex: 1157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162832,63 +183303,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -162897,11 +183365,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -162913,13 +183383,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162939,8 +183411,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162958,37 +183430,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163002,50 +183472,55 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -163058,11 +183533,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163073,13 +183550,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163099,8 +183578,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163118,37 +183597,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163164,52 +183641,53 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -163222,11 +183700,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163237,6 +183715,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163244,6 +183723,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163263,8 +183743,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163282,29 +183762,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -163312,7 +183792,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163328,56 +183808,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163385,8 +183866,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -163401,6 +183882,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163408,6 +183890,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163427,8 +183910,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163446,28 +183929,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -163476,7 +183959,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163491,42 +183974,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -163538,9 +184022,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -163549,12 +184033,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163565,6 +184051,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163572,6 +184059,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163591,8 +184079,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163610,8 +184098,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -163619,28 +184107,26 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163648,16 +184134,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -163668,33 +184154,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -163714,7 +184201,9 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -163729,6 +184218,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163736,6 +184226,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163755,8 +184246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163774,37 +184265,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1040 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163812,15 +184301,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -163828,29 +184317,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -163861,7 +184355,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -163874,11 +184368,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163889,13 +184385,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163915,8 +184413,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163934,8 +184432,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -163954,17 +184452,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163978,39 +184474,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -164034,11 +184535,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -164049,13 +184552,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164075,8 +184580,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164094,8 +184599,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -164114,17 +184619,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164149,46 +184652,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164197,11 +184701,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -164213,6 +184717,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -164220,6 +184725,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164239,8 +184745,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164258,14 +184764,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -164279,7 +184785,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -164288,7 +184794,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164296,7 +184802,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -164304,7 +184810,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -164312,37 +184818,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164350,9 +184857,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164361,12 +184868,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -164377,6 +184884,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -164384,6 +184892,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164403,8 +184912,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164422,14 +184931,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -164442,17 +184951,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164460,45 +184969,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -164509,10 +185023,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164521,11 +185035,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -164537,13 +185051,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164563,8 +185079,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164582,37 +185098,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164620,49 +185136,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164670,9 +185191,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164681,8 +185202,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -164697,13 +185220,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164723,8 +185248,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164742,14 +185267,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -164758,21 +185283,19 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164780,59 +185303,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164841,12 +185369,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -164857,13 +185387,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164883,8 +185415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164902,37 +185434,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164940,59 +185470,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -165001,12 +185536,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165017,13 +185554,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165043,8 +185582,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165062,37 +185601,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165100,13 +185637,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -165124,25 +185661,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 528 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165150,9 +185692,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -165161,12 +185703,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165177,13 +185719,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165203,8 +185747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165222,20 +185766,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -165243,16 +185787,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165260,23 +185804,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -165284,25 +185828,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCB: 32 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165337,13 +185886,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165363,8 +185914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165382,20 +185933,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -165403,16 +185954,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165420,23 +185971,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -165444,25 +185995,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCB: 32 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165471,9 +186027,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165481,12 +186037,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165497,13 +186055,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165523,8 +186083,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165542,37 +186102,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165580,16 +186138,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165600,33 +186158,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 32 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165635,9 +186194,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165645,11 +186204,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -165661,6 +186222,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -165668,6 +186230,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165687,8 +186250,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165706,20 +186269,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -165727,16 +186290,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165744,60 +186305,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165805,11 +186371,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -165821,13 +186389,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165847,8 +186417,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165866,15 +186436,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -165882,21 +186452,19 @@ ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165910,37 +186478,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -165954,9 +186527,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -165965,12 +186538,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165981,13 +186556,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166007,8 +186584,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166026,37 +186603,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166071,7 +186646,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -166084,27 +186659,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -166118,9 +186694,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166129,11 +186705,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -166145,13 +186723,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166171,8 +186751,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166190,37 +186770,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166228,49 +186806,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -166281,10 +186860,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166293,11 +186872,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -166309,6 +186890,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166316,6 +186898,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166335,8 +186918,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166354,37 +186937,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166392,53 +186973,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166446,10 +187024,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166457,13 +187035,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166473,13 +187054,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166499,8 +187082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166518,37 +187101,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166556,53 +187137,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166610,10 +187188,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166621,13 +187199,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166637,13 +187218,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166663,8 +187246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166682,37 +187265,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166720,16 +187301,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -166744,40 +187325,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166785,13 +187367,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166801,6 +187386,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166808,6 +187394,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166827,8 +187414,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166846,16 +187433,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -166867,16 +187454,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166884,16 +187469,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -166908,29 +187493,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166938,10 +187524,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166949,13 +187535,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166965,6 +187554,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166972,6 +187562,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166991,8 +187582,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167010,16 +187601,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -167031,16 +187622,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167048,7 +187637,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -167056,7 +187645,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -167064,37 +187653,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167103,9 +187693,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167113,13 +187703,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167129,6 +187720,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167136,6 +187728,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167155,8 +187748,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167174,16 +187767,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -167194,17 +187787,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167212,7 +187805,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -167221,7 +187814,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -167236,40 +187829,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167277,13 +187871,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167293,6 +187888,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167300,6 +187896,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167319,8 +187916,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167338,16 +187935,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -167359,16 +187956,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167376,15 +187973,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -167392,37 +187989,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167430,10 +188028,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167441,13 +188039,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167457,6 +188058,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167464,6 +188066,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167483,8 +188086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167502,33 +188105,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 + SolutionIndex: 1186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167540,48 +188141,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -167594,10 +188195,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167606,15 +188207,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167652,8 +188252,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167671,8 +188271,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -167681,21 +188281,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167707,44 +188309,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -167761,10 +188363,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167773,13 +188375,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167817,8 +188422,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167836,8 +188441,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -167846,23 +188451,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167881,8 +188484,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -167890,32 +188493,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -167929,9 +188532,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167940,13 +188543,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167958,7 +188564,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -167984,8 +188590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168003,8 +188609,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168013,23 +188619,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168048,8 +188652,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168057,32 +188661,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168096,9 +188700,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168107,13 +188711,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168125,7 +188732,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -168151,8 +188758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168170,8 +188777,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168180,23 +188787,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168208,48 +188813,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 - LVPB: 32 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168262,10 +188867,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168274,13 +188879,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168318,8 +188926,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168337,8 +188945,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168347,23 +188955,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168383,7 +188989,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168391,32 +188997,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168430,9 +189036,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168441,13 +189047,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168485,8 +189092,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168504,8 +189111,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168514,22 +189121,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -168542,15 +189149,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168558,32 +189165,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168596,11 +189203,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -168608,15 +189215,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168654,8 +189260,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168673,8 +189279,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168683,21 +189289,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168715,42 +189323,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 96 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168764,10 +189368,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -168775,13 +189379,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168792,8 +189399,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -168819,8 +189426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168838,8 +189445,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168848,13 +189455,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -168863,8 +189470,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168882,7 +189487,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -168902,24 +189507,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 2 + LSPA: 4 LSPB: 32 - LVCA: 128 + LVCA: 64 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 784 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -168931,10 +189532,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -168942,15 +189543,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168961,8 +189563,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -168988,8 +189590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169007,8 +189609,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169017,11 +189619,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -169029,7 +189631,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -169043,23 +189645,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -169069,24 +189671,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 64 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169097,10 +189699,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169109,15 +189711,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169129,7 +189730,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -169155,8 +189756,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169174,8 +189775,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169184,10 +189785,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169198,7 +189799,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169210,50 +189813,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 64 + LSCB: 16 LSPA: 4 - LSPB: 64 + LSPB: 16 LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169264,10 +189863,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169276,15 +189875,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169295,7 +189895,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169322,8 +189922,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169341,8 +189941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169351,21 +189951,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169377,50 +189977,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169431,10 +190027,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169443,15 +190039,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169462,8 +190059,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -169489,8 +190086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169508,8 +190105,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169518,21 +190115,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169544,16 +190141,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -169570,24 +190167,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169598,10 +190191,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169610,15 +190203,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169629,7 +190223,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169656,8 +190250,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169675,8 +190269,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169685,10 +190279,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169697,9 +190291,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169711,16 +190305,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -169737,24 +190331,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169765,10 +190355,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169777,15 +190367,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169796,7 +190385,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169823,8 +190412,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169842,8 +190431,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169852,10 +190441,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169864,9 +190453,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169884,59 +190475,55 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -169944,8 +190531,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -169953,6 +190540,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169963,7 +190551,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169990,8 +190578,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170009,29 +190597,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -170045,15 +190633,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -170061,38 +190649,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170100,10 +190688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170111,13 +190699,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170129,7 +190720,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -170155,8 +190746,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170174,33 +190765,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170212,7 +190801,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -170220,7 +190809,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -170228,38 +190817,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170267,10 +190856,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170278,13 +190867,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170322,8 +190912,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170341,31 +190931,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -170379,16 +190969,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -170399,34 +190989,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170434,10 +191024,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170445,13 +191035,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170463,7 +191056,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -170489,8 +191082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170508,33 +191101,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170546,54 +191137,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 8 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170601,10 +191192,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170612,13 +191203,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170630,7 +191224,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -170656,8 +191250,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170675,33 +191269,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170713,7 +191305,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -170721,7 +191313,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -170729,38 +191321,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170768,10 +191360,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170779,13 +191371,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170823,8 +191416,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170842,31 +191435,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -170880,14 +191473,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -170906,28 +191499,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170935,10 +191528,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170946,13 +191539,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170990,8 +191586,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171009,20 +191605,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -171030,16 +191626,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171047,48 +191641,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171101,10 +191691,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -171113,13 +191703,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171130,8 +191723,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -171157,8 +191750,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171176,8 +191769,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171186,23 +191779,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -171221,41 +191812,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171269,10 +191860,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171280,15 +191871,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171326,8 +191916,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171345,8 +191935,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171355,25 +191945,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171389,36 +191981,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -171437,9 +192029,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171447,15 +192039,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171467,7 +192060,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -171493,8 +192086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171512,8 +192105,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171522,17 +192115,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -171583,9 +192176,9 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -171604,9 +192197,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171614,15 +192207,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171660,8 +192254,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171679,8 +192273,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171689,11 +192283,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -171701,13 +192295,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171721,42 +192315,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171770,10 +192360,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171781,8 +192371,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -171790,6 +192380,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171800,7 +192391,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -171827,8 +192418,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171846,8 +192437,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171856,25 +192447,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171890,42 +192481,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -171937,10 +192528,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171948,8 +192539,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -171957,6 +192548,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171994,8 +192586,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172013,8 +192605,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172023,25 +192615,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172055,44 +192647,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 520 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -172104,10 +192692,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172115,8 +192703,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -172124,6 +192712,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172134,8 +192723,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -172161,8 +192750,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172180,8 +192769,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172190,25 +192779,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172216,50 +192805,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 32 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 16 LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -172270,11 +192855,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172282,15 +192867,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172301,8 +192887,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -172328,8 +192914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172347,8 +192933,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172357,25 +192943,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172383,7 +192969,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172391,40 +192977,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -172437,11 +193023,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172449,15 +193035,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172469,7 +193056,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -172495,8 +193082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172514,8 +193101,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172524,21 +193111,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -172557,58 +193144,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172616,13 +193203,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172634,7 +193224,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -172660,8 +193250,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172679,37 +193269,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172717,65 +193305,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172783,13 +193367,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172800,7 +193387,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -172827,8 +193414,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172846,37 +193433,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172884,7 +193469,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172893,33 +193478,33 @@ ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 @@ -172927,17 +193512,17 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -172946,7 +193531,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 @@ -172955,6 +193540,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172966,7 +193552,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -172992,8 +193578,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -173011,31 +193597,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 + SolutionIndex: 1219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173054,8 +193640,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -173063,49 +193649,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -173114,14 +193700,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173159,8 +193744,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -173178,31 +193763,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173214,54 +193801,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -173269,24 +193856,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173324,9 +193916,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -173343,33 +193936,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173381,54 +193972,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -173436,24 +194027,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173491,9 +194087,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -173510,33 +194107,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173556,66 +194151,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -173623,6 +194220,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173660,9 +194258,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -173679,29 +194278,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 + SolutionIndex: 1223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -173722,9 +194321,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -173735,61 +194334,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173827,9 +194427,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -173846,31 +194447,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173889,9 +194492,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -173911,52 +194514,53 @@ LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173994,9 +194598,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174013,15 +194618,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -174034,10 +194639,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174049,15 +194656,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -174069,61 +194676,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 8 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174135,7 +194743,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -174161,9 +194769,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174180,31 +194789,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174216,7 +194827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -174225,14 +194836,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -174243,52 +194854,55 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 96 LVCA: 32 - LVCB: 16 + LVCB: 2 LVPA: 4 - LVPB: 8 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174326,9 +194940,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174345,31 +194960,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -174383,7 +194998,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -174391,8 +195006,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -174403,34 +195018,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -174439,23 +195054,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174493,9 +195111,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174512,31 +195131,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -174550,16 +195169,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -174576,53 +195195,58 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174634,7 +195258,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -174660,9 +195284,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174679,20 +195304,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -174700,12 +195325,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174717,7 +195340,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -174725,8 +195348,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -174737,34 +195360,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -174772,19 +195395,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -174792,6 +195417,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174803,7 +195429,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -174829,9 +195455,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174848,31 +195475,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174884,7 +195511,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -174892,46 +195519,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -174939,26 +195566,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174996,9 +195626,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175015,31 +195646,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175051,7 +195682,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -175059,46 +195690,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175106,26 +195737,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175163,9 +195797,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175182,31 +195817,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 + SolutionIndex: 1232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175218,79 +195853,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175328,9 +195968,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175347,33 +195988,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175385,7 +196024,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -175394,14 +196033,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -175411,28 +196050,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175440,24 +196079,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175495,9 +196137,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175514,31 +196157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -175552,16 +196195,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -175572,34 +196215,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175607,26 +196250,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175664,9 +196308,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175683,31 +196328,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175719,54 +196366,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175774,26 +196421,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175805,7 +196453,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -175831,9 +196479,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175850,31 +196499,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175886,54 +196537,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175941,26 +196592,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175972,7 +196624,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -175998,9 +196650,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -176017,31 +196670,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -176053,54 +196708,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176108,26 +196763,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176165,9 +196821,183 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -176184,8 +197014,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -176194,21 +197024,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -176228,7 +197058,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176236,34 +197066,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -176275,18 +197105,20 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 @@ -176295,6 +197127,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176306,7 +197139,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -176332,9 +197165,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -176351,8 +197185,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -176361,17 +197195,17 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -176387,16 +197221,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -176414,21 +197248,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -176441,7 +197275,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -176449,19 +197283,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176499,9 +197334,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -176518,8 +197354,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -176542,7 +197378,9 @@ WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [704, 1024, 1, 128] - [102, 3019.46] @@ -179022,8 +199860,6 @@ - [212, 8995.84] - - [4096, 512, 1, 2048] - [207, 9298.08] - - - [512, 256, 1, 2048] - - [200, 5186.16] - - [4096, 1024, 1, 2048] - [189, 9790.67] - - [2048, 1024, 1, 2048] @@ -182512,5972 +203348,6502 @@ - [524, 10427.3] - - [1024, 1, 1, 13] - [537, 0.0] + - - [768, 512, 1, 768] + - [561, 5889.04] + - - [768, 2048, 1, 3072] + - [571, 9394.62] + - - [768, 32, 1, 768] + - [583, 1502.74] + - - [64, 128, 96, 128] + - [578, 4973.48] + - - [3072, 1024, 1, 768] + - [572, 9856.07] + - - [768, 1024, 1, 3072] + - [565, 8611.06] + - - [768, 512, 1, 3072] + - [564, 6430.79] + - - [768, 64, 1, 768] + - [585, 2621.44] + - - [768, 4096, 1, 3072] + - [570, 10030.4] + - - [768, 2048, 1, 2] + - [563, 381.763] + - - [768, 2048, 1, 768] + - [568, 9754.2] + - - [768, 320, 1, 30522] + - [581, 8529.4] + - - [64, 64, 96, 64] + - [575, 2496.61] + - - [768, 640, 1, 30522] + - [562, 8253.84] + - - [768, 1280, 1, 30522] + - [567, 9572.85] + - - [768, 1280, 1, 768] + - [571, 8713.93] + - - [768, 640, 1, 768] + - [561, 7293.03] + - - [768, 32, 1, 2] + - [573, 11.8154] + - - [3072, 2048, 1, 768] + - [568, 10019.6] + - - [768, 4096, 1, 768] + - [568, 9927.35] + - - [3072, 4096, 1, 768] + - [571, 10150.1] + - - [64, 256, 192, 256] + - [577, 7054.19] + - - [768, 8, 1, 768] + - [584, 340.939] + - - [64, 128, 384, 128] + - [576, 6765.01] + - - [768, 1024, 1, 768] + - [566, 8768.58] + - - [768, 320, 1, 768] + - [582, 6838.54] + - - [64, 64, 768, 64] + - [579, 5388.83] + - - [768, 1024, 1, 2] + - [559, 258.695] + - - [768, 16, 1, 768] + - [584, 819.2] + - - [64, 256, 96, 256] + - [577, 5893.64] + - - [3072, 512, 1, 768] + - [569, 9722.79] + - - [768, 160, 1, 768] + - [586, 5019.78] + - - [768, 4096, 1, 2] + - [560, 507.375] + - - [1600, 512, 1, 1024] + - [590, 7186.95] + - - [1024, 512, 1, 64] + - [588, 2557.5] + - - [1024, 512, 1, 1] + - [587, 71.2348] + - - [2048, 512, 1, 1] + - [589, 90.3945] + - - [1024, 200, 1, 1] + - [595, 40.0] + - - [32, 200, 1, 1] + - [591, 1.56863] + - - [560, 200, 1, 1024] + - [599, 4731.35] + - - [1, 512, 1, 1] + - [598, 0.130612] + - - [64, 512, 1, 1] + - [593, 7.58519] + - - [1024, 8192, 1, 256] + - [608, 9518.99] + - - [1024, 22016, 1, 256] + - [614, 9881.12] + - - [256, 8976, 1, 4352] + - [606, 9567.08] + - - [512, 256, 1, 2048] + - [619, 5917.89] + - - [1024, 19968, 1, 256] + - [614, 9882.37] + - - [256, 8976, 1, 1536] + - [604, 8437.35] + - - [256, 8976, 1, 33536] + - [604, 8441.89] + - - [1024, 1792, 1, 256] + - [604, 7756.97] + - - [1024, 21504, 1, 256] + - [614, 9893.9] + - - [512, 215, 1, 2048] + - [620, 4665.64] + - - [1024, 7168, 1, 256] + - [608, 9509.35] + - - [256, 8976, 1, 15872] + - [610, 8914.65] + - - [1024, 19712, 1, 256] + - [614, 9771.9] + - - [256, 8976, 1, 5632] + - [610, 8740.03] + - - [1024, 14848, 1, 256] + - [614, 9756.15] + - - [1024, 28672, 1, 256] + - [614, 9958.92] + - - [256, 8976, 1, 9728] + - [617, 8853.04] + - - [1024, 17152, 1, 256] + - [608, 9737.3] + - - [256, 8976, 1, 11520] + - [610, 8999.2] + - - [256, 8976, 1, 8192] + - [600, 7897.32] + - - [1024, 3328, 1, 256] + - [615, 8593.53] + - - [256, 8976, 1, 7424] + - [610, 8980.47] + - - [1024, 18944, 1, 256] + - [614, 9854.85] + - - [1024, 10496, 1, 256] + - [609, 9453.9] + - - [256, 8976, 1, 5376] + - [607, 9608.37] + - - [256, 8976, 1, 6144] + - [604, 7880.13] + - - [1024, 40448, 1, 256] + - [614, 10016.6] + - - [256, 8976, 1, 22016] + - [617, 8939.87] + - - [256, 8976, 1, 4864] + - [605, 9211.43] + - - [256, 8976, 1, 12288] + - [601, 8065.05] + - - [1024, 9728, 1, 256] + - [614, 9636.25] + - - [256, 8976, 1, 2048] + - [602, 7001.33] + - - [1024, 10240, 1, 256] + - [608, 9619.96] + - - [256, 8976, 1, 2304] + - [606, 9509.74] + - - [1024, 7936, 1, 256] + - [614, 9300.67] + - - [768, 256, 1, 2048] + - [618, 6267.95] + - - [1024, 9984, 1, 256] + - [614, 9477.28] + - - [1024, 13312, 1, 256] + - [614, 9758.56] + - - [1024, 16128, 1, 256] + - [608, 9721.9] + - - [1024, 8960, 1, 256] + - [609, 9398.25] + - - [1024, 5120, 1, 256] + - [615, 9315.5] + - - [1024, 11264, 1, 256] + - [608, 9664.8] + - - [256, 8976, 1, 20480] + - [616, 8279.87] + - - [1024, 20992, 1, 256] + - [608, 9878.87] + - - [256, 8976, 1, 9472] + - [610, 8990.96] + - - [256, 8976, 1, 8448] + - [610, 8983.52] + - - [256, 8976, 1, 20992] + - [611, 8942.11] + - - [256, 8976, 1, 10496] + - [611, 8989.71] + - - [1024, 15104, 1, 256] + - [609, 9676.01] + - - [1024, 6400, 1, 256] + - [617, 9145.89] + - - [1024, 4096, 1, 256] + - [610, 9124.25] + - - [256, 8976, 1, 2560] + - [604, 8566.11] + - - [256, 8976, 1, 2816] + - [606, 9496.84] + - - [1024, 7680, 1, 256] + - [614, 9460.84] + - - [256, 8976, 1, 14336] + - [611, 8226.8] + - - [256, 8976, 1, 6656] + - [611, 8771.42] + - - [1024, 3072, 1, 256] + - [611, 9076.94] + - - [256, 8976, 1, 5888] + - [607, 9546.3] + - - [1024, 12288, 1, 256] + - [608, 9690.81] + - - [256, 8976, 1, 26112] + - [613, 8699.83] + - - [1024, 7424, 1, 256] + - [615, 9256.84] + - - [256, 8976, 1, 14848] + - [616, 8885.79] + - - [768, 215, 1, 2048] + - [618, 5628.59] + - - [1024, 2560, 1, 256] + - [611, 8820.83] + - - [256, 8976, 1, 19968] + - [610, 8928.86] + - - [256, 8976, 1, 9984] + - [610, 8993.12] + - - [1024, 4864, 1, 256] + - [611, 8974.3] + - - [1024, 33536, 1, 256] + - [614, 9943.07] + - - [256, 8976, 1, 15104] + - [611, 8996.63] + - - [1024, 2048, 1, 256] + - [609, 8462.66] + - - [256, 8976, 1, 8960] + - [611, 8998.92] + - - [1024, 6144, 1, 256] + - [616, 9359.67] + - - [1024, 14592, 1, 256] + - [614, 9667.42] + - - [256, 8976, 1, 19712] + - [610, 9020.11] + - - [1024, 11520, 1, 256] + - [609, 9527.7] + - - [1024, 5632, 1, 256] + - [608, 9297.2] + - - [256, 8976, 1, 11008] + - [617, 8994.8] + - - [256, 8976, 1, 17152] + - [611, 9003.8] + - - [256, 8976, 1, 3072] + - [600, 8261.96] + - - [1024, 3840, 1, 256] + - [617, 8671.89] + - - [1024, 14336, 1, 256] + - [614, 9760.28] + - - [1024, 20480, 1, 256] + - [608, 9887.85] + - - [1024, 23552, 1, 256] + - [608, 9890.46] + - - [256, 8976, 1, 7168] + - [603, 8478.34] + - - [1024, 13568, 1, 256] + - [608, 9654.64] + - - [1024, 4608, 1, 256] + - [616, 9218.25] + - - [256, 8976, 1, 10240] + - [601, 8076.16] + - - [1024, 8704, 1, 256] + - [610, 9475.5] + - - [1024, 11008, 1, 256] + - [614, 9524.96] + - - [1024, 8448, 1, 256] + - [608, 9352.16] + - - [256, 8976, 1, 44505] + - [612, 8430.23] - - [704, 1024, 1, 128] - - [661, 3019.56] + - [723, 3019.56] - - [1024, 1024, 1, 3328] - - [699, 8162.65] + - [761, 8162.65] - - [4, 704, 1, 1280] - - [602, 319.646] + - [664, 319.646] - - [4, 1856, 1, 3328] - - [632, 550.614] + - [694, 550.614] - - [1856, 448, 1, 3328] - - [684, 6813.15] + - [746, 6813.15] - - [2944, 4288, 1, 1280] - - [693, 8975.86] + - [755, 8975.86] - - [2368, 64, 1, 3328] - - [607, 5482.33] + - [669, 5482.33] - - [1760, 32, 1, 1760] - - [646, 3860.04] + - [708, 3860.04] - - [2368, 5888, 1, 256] - - [690, 8656.83] + - [752, 8656.83] - - [5888, 1856, 1, 256] - - [680, 7881.53] + - [742, 7881.53] - - [64, 3584, 1, 1280] - - [616, 4835.43] + - [678, 4835.43] - - [512, 24000, 1, 1536] - - [687, 8666.0] + - [749, 8666.0] - - [128, 6784, 1, 3328] - - [684, 7062.35] + - [746, 7062.35] - - [5888, 1408, 1, 256] - - [697, 8130.32] + - [759, 8130.32] - - [5888, 1856, 1, 3328] - - [687, 8840.85] + - [749, 8840.85] - - [512, 4, 1, 512] - - [572, 170.323] + - [634, 170.323] - - [35, 1500, 1, 2560] - - [576, 2896.65] + - [638, 2896.65] - - [1856, 4288, 1, 256] - - [676, 8374.73] + - [738, 8374.73] - - [1024, 5056, 1, 128] - - [673, 3304.35] + - [735, 3304.35] - - [5056, 5056, 1, 3328] - - [687, 8905.53] + - [749, 8905.53] - - [1408, 5888, 1, 1280] - - [687, 9418.2] + - [749, 9418.2] - - [2368, 448, 1, 128] - - [661, 3075.07] + - [723, 3075.07] - - [6144, 6000, 1, 2560] - - [687, 9336.43] + - [749, 9336.43] - - [2368, 6784, 1, 128] - - [660, 4919.36] + - [722, 4919.36] - - [1024, 3584, 1, 3328] - - [678, 8071.17] + - [740, 8071.17] - - [512, 48000, 1, 2048] - - [687, 8763.16] + - [749, 8763.16] - - [1408, 64, 1, 128] - - [583, 805.57] + - [645, 805.57] - - [256, 4288, 1, 3328] - - [709, 6331.96] + - [771, 6331.96] - - [5888, 1408, 1, 1280] - - [677, 9226.27] + - [739, 9226.27] - - [704, 1856, 1, 3328] - - [703, 6309.5] + - [765, 6309.5] - - [1408, 4288, 1, 256] - - [687, 8374.6] + - [749, 8374.6] - - [1024, 2368, 1, 256] - - [684, 7341.12] + - [746, 7341.12] - - [64, 4, 1, 256] - - [627, 13.1032] + - [689, 13.1032] - - [1408, 1856, 1, 1280] - - [694, 8773.05] + - [756, 8773.05] - - [1408, 64, 1, 1280] - - [640, 4050.08] + - [702, 4050.08] - - [448, 1024, 1, 1280] - - [703, 6071.26] + - [765, 6071.26] - - [4096, 32, 1, 4096] - - [637, 5491.82] + - [699, 5491.82] - - [256, 1408, 1, 3328] - - [689, 5351.49] + - [751, 5351.49] - - [5056, 5056, 1, 1280] - - [697, 9408.67] + - [759, 9408.67] - - [448, 5056, 1, 256] - - [702, 6680.54] + - [764, 6680.54] - - [704, 1856, 1, 1280] - - [679, 7504.03] + - [741, 7504.03] - - [128, 5056, 1, 128] - - [594, 2316.58] + - [656, 2316.58] - - [2368, 128, 1, 256] - - [679, 3660.22] + - [741, 3660.22] - - [1856, 1408, 1, 128] - - [666, 3885.97] + - [728, 3885.97] - - [64, 5056, 1, 256] - - [689, 3318.91] + - [751, 3318.91] - - [6784, 256, 1, 3328] - - [687, 7590.64] + - [749, 7590.64] - - [1408, 3584, 1, 256] - - [676, 8276.92] + - [738, 8276.92] - - [4288, 448, 1, 256] - - [689, 7139.79] + - [751, 7139.79] - - [64, 704, 1, 128] - - [590, 375.567] + - [652, 375.567] - - [1024, 1856, 1, 128] - - [659, 2890.66] + - [721, 2890.66] - - [4288, 2944, 1, 1280] - - [693, 8981.45] + - [755, 8981.45] - - [704, 5056, 1, 1280] - - [679, 7684.72] + - [741, 7684.72] - - [2368, 704, 1, 3328] - - [694, 7070.14] + - [756, 7070.14] - - [256, 5888, 1, 256] - - [679, 7319.45] + - [741, 7319.45] - - [1856, 4288, 1, 3328] - - [677, 9238.69] + - [739, 9238.69] - - [256, 2944, 1, 256] - - [679, 6090.31] + - [741, 6090.31] - - [5888, 1024, 1, 256] - - [683, 8270.05] + - [745, 8270.05] - - [448, 64, 1, 1280] - - [636, 2493.32] + - [698, 2493.32] - - [3072, 64, 1, 1024] - - [619, 3149.77] + - [681, 3149.77] - - [3584, 4, 1, 1280] - - [721, 567.862] + - [783, 567.862] - - [2560, 16, 1, 2560] - - [628, 2887.15] + - [690, 2887.15] - - [2944, 64, 1, 256] - - [619, 2565.76] + - [681, 2565.76] - - [128, 4, 1, 1280] - - [722, 78.8692] + - [784, 78.8692] - - [1408, 2944, 1, 256] - - [683, 8337.3] + - [745, 8337.3] - - [256, 1856, 1, 1280] - - [709, 6267.35] + - [771, 6267.35] - - [6784, 5056, 1, 3328] - - [693, 9424.0] + - [755, 9424.0] - - [5056, 5056, 1, 256] - - [680, 8758.33] + - [742, 8758.33] - - [128, 256, 1, 256] - - [635, 1205.36] + - [697, 1205.36] - - [64, 1024, 1, 1280] - - [646, 3566.68] + - [708, 3566.68] - - [2944, 4, 1, 256] - - [599, 319.449] + - [661, 319.449] - - [704, 5056, 1, 128] - - [668, 4073.83] + - [730, 4073.83] - - [4, 2368, 1, 1280] - - [627, 496.992] + - [689, 496.992] - - [2368, 2944, 1, 1280] - - [676, 9085.55] + - [738, 9085.55] - - [448, 448, 1, 3328] - - [654, 5428.76] + - [716, 5428.76] - - [6784, 6784, 1, 1280] - - [693, 8727.03] + - [755, 8727.03] - - [1024, 256, 1, 3328] - - [703, 5499.42] + - [765, 5499.42] - - [1408, 4288, 1, 1280] - - [677, 9094.42] + - [739, 9094.42] - - [3584, 4288, 1, 1280] - - [680, 8703.88] + - [742, 8703.88] - - [512, 6000, 1, 2560] - - [683, 8474.56] + - [745, 8474.56] - - [2368, 704, 1, 1280] - - [689, 7651.59] + - [751, 7651.59] - - [5056, 4288, 1, 3328] - - [697, 8545.35] + - [759, 8545.35] - - [3584, 2368, 1, 3328] - - [685, 8797.88] + - [747, 8797.88] - - [5888, 6784, 1, 1280] - - [683, 8785.18] + - [745, 8785.18] - - [64, 704, 1, 1280] - - [606, 2783.48] + - [668, 2783.48] - - [4288, 256, 1, 256] - - [679, 6162.78] + - [741, 6162.78] - - [2944, 128, 1, 128] - - [581, 1951.33] + - [643, 1951.33] - - [6144, 32, 1, 2560] - - [640, 4589.05] + - [702, 4589.05] - - [6784, 448, 1, 1280] - - [684, 8674.31] + - [746, 8674.31] - - [2944, 5888, 1, 256] - - [697, 8991.76] + - [759, 8991.76] - - [64, 64, 1, 1280] - - [657, 712.448] + - [719, 712.448] - - [4288, 2944, 1, 256] - - [693, 8678.14] + - [755, 8678.14] - - [5888, 704, 1, 1280] - - [683, 8652.71] + - [745, 8652.71] - - [5056, 4, 1, 3328] - - [599, 650.772] + - [661, 650.772] - - [1856, 64, 1, 1280] - - [616, 4471.97] + - [678, 4471.97] - - [1760, 16, 1, 1760] - - [656, 2592.23] + - [718, 2592.23] - - [448, 5888, 1, 128] - - [666, 3823.03] + - [728, 3823.03] - - [5888, 64, 1, 3328] - - [648, 6013.22] + - [710, 6013.22] - - [2944, 256, 1, 3328] - - [689, 7791.45] + - [751, 7791.45] - - [1024, 64, 1, 128] - - [590, 592.516] + - [652, 592.516] - - [5056, 2368, 1, 1280] - - [676, 9260.53] + - [738, 9260.53] - - [448, 3584, 1, 1280] - - [697, 6771.34] + - [759, 6771.34] - - [6784, 5888, 1, 256] - - [691, 7933.39] + - [753, 7933.39] - - [64, 1024, 1, 3328] - - [640, 4783.08] + - [702, 4783.08] - - [704, 128, 1, 1280] - - [646, 3971.98] + - [708, 3971.98] - - [4, 3584, 1, 128] - - [715, 59.5238] + - [777, 59.5238] - - [1408, 448, 1, 1280] - - [689, 5902.17] + - [751, 5902.17] - - [1024, 1408, 1, 256] - - [684, 5272.94] + - [746, 5272.94] - - [2368, 2368, 1, 3328] - - [689, 8488.76] + - [751, 8488.76] - - [1856, 6784, 1, 128] - - [666, 4742.51] + - [728, 4742.51] - - [5056, 704, 1, 3328] - - [692, 7772.48] + - [754, 7772.48] - - [1408, 1856, 1, 256] - - [710, 5229.84] + - [772, 5229.84] - - [1408, 704, 1, 3328] - - [710, 6954.93] + - [772, 6954.93] - - [2368, 5056, 1, 256] - - [683, 8580.68] + - [745, 8580.68] - - [1408, 256, 1, 1280] - - [709, 4790.11] + - [771, 4790.11] - - [3072, 128, 1, 1024] - - [705, 4579.87] + - [767, 4579.87] - - [3584, 2368, 1, 1280] - - [676, 8675.13] + - [738, 8675.13] - - [4288, 64, 1, 3328] - - [655, 5550.11] + - [717, 5550.11] - - [2368, 4, 1, 1280] - - [721, 537.518] + - [783, 537.518] - - [704, 5888, 1, 256] - - [677, 5305.88] + - [739, 5305.88] - - [6784, 2944, 1, 128] - - [673, 4344.21] + - [735, 4344.21] - - [6784, 64, 1, 256] - - [703, 4496.42] + - [765, 4496.42] - - [2944, 256, 1, 256] - - [689, 6553.7] + - [751, 6553.7] - - [2944, 6784, 1, 3328] - - [677, 8895.76] + - [739, 8895.76] - - [128, 1, 1, 1408] - - [657, 25.7] + - [719, 25.7] - - [704, 1408, 1, 3328] - - [691, 7913.21] + - [753, 7913.21] - - [3584, 704, 1, 3328] - - [676, 7526.43] + - [738, 7526.43] - - [2944, 256, 1, 128] - - [660, 2830.76] + - [722, 2830.76] - - [6784, 4, 1, 1280] - - [717, 645.235] + - [779, 645.235] - - [1024, 64, 1, 1280] - - [615, 3013.25] + - [677, 3013.25] - - [8448, 4, 1, 2816] - - [567, 984.768] + - [629, 984.768] - - [448, 4288, 1, 256] - - [689, 7139.79] + - [751, 7139.79] - - [64, 3584, 1, 3328] - - [613, 5683.27] + - [675, 5683.27] - - [704, 2368, 1, 1280] - - [697, 7045.3] + - [759, 7045.3] - - [1856, 2368, 1, 1280] - - [694, 8327.9] + - [756, 8327.9] - - [2368, 128, 1, 3328] - - [630, 6082.65] + - [692, 6082.65] - - [64, 193600, 1, 64] - - [679, 6747.77] + - [741, 6747.77] - - [1760, 128, 1, 1760] - - [607, 5513.07] + - [669, 5513.07] - - [448, 1408, 1, 256] - - [689, 5591.54] + - [751, 5591.54] - - [1856, 4288, 1, 1280] - - [687, 8647.72] + - [749, 8647.72] - - [64, 5056, 1, 3328] - - [647, 6096.59] + - [709, 6096.59] - - [512, 1500, 1, 2816] - - [689, 7879.3] + - [751, 7879.3] - - [1024, 448, 1, 128] - - [661, 1844.33] + - [723, 1844.33] - - [704, 4, 1, 1280] - - [627, 341.433] + - [689, 341.433] - - [704, 256, 1, 128] - - [661, 1001.34] + - [723, 1001.34] - - [256, 193600, 1, 64] - - [697, 8113.3] + - [759, 8113.3] - - [704, 2944, 1, 128] - - [668, 3747.13] + - [730, 3747.13] - - [1408, 1024, 1, 1280] - - [694, 7080.71] + - [756, 7080.71] - - [704, 6784, 1, 256] - - [712, 6630.47] + - [774, 6630.47] - - [6784, 704, 1, 256] - - [679, 8005.86] + - [741, 8005.86] - - [5056, 1408, 1, 128] - - [670, 4303.13] + - [732, 4303.13] - - [2048, 7000, 1, 2048] - - [687, 9269.2] + - [749, 9269.2] - - [256, 3584, 1, 3328] - - [681, 7334.48] + - [743, 7334.48] - - [5056, 704, 1, 256] - - [689, 7954.12] + - [751, 7954.12] - - [128, 1408, 1, 128] - - [584, 1243.02] + - [646, 1243.02] - - [3584, 4288, 1, 3328] - - [713, 7683.81] + - [775, 7683.81] - - [5888, 1856, 1, 1280] - - [677, 8831.34] + - [739, 8831.34] - - [256, 1408, 1, 256] - - [679, 4352.68] + - [741, 4352.68] - - [5056, 64, 1, 1280] - - [646, 5012.05] + - [708, 5012.05] - - [1024, 704, 1, 256] - - [679, 5710.17] + - [741, 5710.17] - - [64, 256, 1, 128] - - [585, 149.897] + - [647, 149.897] - - [2368, 3584, 1, 1280] - - [687, 8609.68] + - [749, 8609.68] - - [1024, 256, 1, 256] - - [703, 3276.9] + - [765, 3276.9] - - [1856, 4, 1, 1280] - - [601, 497.104] + - [663, 497.104] - - [448, 448, 1, 256] - - [689, 3117.83] + - [751, 3117.83] - - [2944, 3584, 1, 3328] - - [677, 8879.45] + - [739, 8879.45] - - [7680, 32, 1, 2560] - - [647, 5310.24] + - [709, 5310.24] - - [128, 4288, 1, 128] - - [587, 2116.2] + - [649, 2116.2] - - [256, 256, 1, 3328] - - [640, 4774.7] + - [702, 4774.7] - - [128, 1024, 1, 3328] - - [641, 5894.8] + - [703, 5894.8] - - [4, 1408, 1, 3328] - - [632, 552.674] + - [694, 552.674] - - [196, 256, 64, 1024] - - [730, 5218.34] + - [792, 5218.34] - - [6784, 2944, 1, 256] - - [695, 8271.18] + - [757, 8271.18] - - [64, 1856, 1, 1280] - - [646, 4167.96] + - [708, 4167.96] - - [64, 1024, 1, 128] - - [580, 589.188] + - [642, 589.188] - - [1024, 1500, 1, 2560] - - [684, 8407.88] + - [746, 8407.88] - - [1856, 2368, 1, 256] - - [679, 8092.15] + - [741, 8092.15] - - [3584, 256, 1, 128] - - [662, 2607.57] + - [724, 2607.57] - - [3584, 6784, 1, 3328] - - [696, 8558.83] + - [758, 8558.83] - - [256, 1024, 1, 256] - - [689, 3901.78] + - [751, 3901.78] - - [4, 6784, 1, 3328] - - [627, 662.575] + - [689, 662.575] - - [1024, 5888, 1, 3328] - - [687, 9161.76] + - [749, 9161.76] - - [1024, 128, 1, 1280] - - [644, 3942.12] + - [706, 3942.12] - - [3072, 32, 1, 1024] - - [621, 2840.49] + - [683, 2840.49] - - [6144, 24000, 1, 2560] - - [677, 7605.87] + - [739, 7605.87] - - [448, 1024, 1, 256] - - [679, 5062.19] + - [741, 5062.19] - - [5056, 4288, 1, 1280] - - [687, 9090.99] + - [749, 9090.99] - - [5888, 64, 1, 256] - - [689, 4449.78] + - [751, 4449.78] - - [1856, 256, 1, 1280] - - [703, 5834.46] + - [765, 5834.46] - - [64, 5888, 1, 3328] - - [641, 6152.44] + - [703, 6152.44] - - [2368, 2368, 1, 1280] - - [681, 8594.66] + - [743, 8594.66] - - [2944, 5888, 1, 128] - - [666, 4776.19] + - [728, 4776.19] - - [704, 5888, 1, 1280] - - [681, 8435.91] + - [743, 8435.91] - - [2368, 3584, 1, 128] - - [663, 4590.71] + - [725, 4590.71] - - [1856, 5056, 1, 128] - - [674, 4503.48] + - [736, 4503.48] - - [4608, 1, 1, 1536] - - [572, 226.955] + - [634, 226.955] - - [448, 256, 1, 3328] - - [616, 5415.56] + - [678, 5415.56] - - [2944, 6784, 1, 1280] - - [700, 8385.11] + - [762, 8385.11] - - [448, 1856, 1, 128] - - [670, 2618.96] + - [732, 2618.96] - - [128, 1024, 1, 128] - - [579, 940.527] + - [641, 940.527] - - [7680, 4, 1, 2560] - - [603, 985.104] + - [665, 985.104] - - [1024, 704, 1, 1280] - - [689, 7204.56] + - [751, 7204.56] - - [128, 5888, 1, 256] - - [679, 6313.52] + - [741, 6313.52] - - [1024, 5056, 1, 1280] - - [684, 8979.76] + - [746, 8979.76] - - [4288, 1024, 1, 256] - - [676, 7198.29] + - [738, 7198.29] - - [2944, 2368, 1, 128] - - [661, 4624.57] + - [723, 4624.57] - - [704, 704, 1, 3328] - - [702, 5870.71] + - [764, 5870.71] - - [704, 1408, 1, 1280] - - [691, 7680.32] + - [753, 7680.32] - - [5888, 448, 1, 1280] - - [679, 7718.66] + - [741, 7718.66] - - [3584, 256, 1, 3328] - - [684, 7523.88] + - [746, 7523.88] - - [704, 5888, 1, 3328] - - [689, 8196.99] + - [751, 8196.99] - - [704, 1856, 1, 128] - - [667, 3388.43] + - [729, 3388.43] - - [128, 3584, 1, 3328] - - [641, 6626.5] + - [703, 6626.5] - - [4, 4288, 1, 128] - - [714, 159.648] + - [776, 159.648] - - [128, 704, 1, 1280] - - [604, 4038.73] + - [666, 4038.73] - - [3584, 2944, 1, 256] - - [677, 7685.99] + - [739, 7685.99] - - [1856, 128, 1, 3328] - - [633, 6070.63] + - [695, 6070.63] - - [1856, 2368, 1, 3328] - - [694, 8460.62] + - [756, 8460.62] - - [512, 6000, 1, 2816] - - [697, 9019.55] + - [759, 9019.55] - - [2944, 448, 1, 128] - - [660, 3027.73] + - [722, 3027.73] - - [64, 193600, 1, 256] - - [703, 7080.32] + - [765, 7080.32] - - [128, 2944, 1, 1280] - - [679, 5397.87] + - [741, 5397.87] - - [448, 2944, 1, 1280] - - [689, 6996.97] + - [751, 6996.97] - - [512, 24000, 1, 2048] - - [697, 8832.67] + - [759, 8832.67] - - [128, 256, 1, 3328] - - [636, 3531.57] + - [698, 3531.57] - - [1408, 5056, 1, 3328] - - [692, 7969.94] + - [754, 7969.94] - - [1856, 1856, 1, 3328] - - [679, 8140.34] + - [741, 8140.34] - - [3584, 128, 1, 256] - - [689, 4861.05] + - [751, 4861.05] - - [448, 1408, 1, 3328] - - [679, 6353.75] + - [741, 6353.75] - - [2368, 2368, 1, 256] - - [693, 8369.37] + - [755, 8369.37] - - [4288, 4288, 1, 1280] - - [683, 8666.52] + - [745, 8666.52] - - [64, 448, 1, 1280] - - [636, 2591.92] + - [698, 2591.92] - - [5888, 1024, 1, 1280] - - [676, 8526.6] + - [738, 8526.6] - - [704, 1024, 1, 256] - - [689, 4971.8] + - [751, 4971.8] - - [1024, 12544, 1, 256] - - [727, 8611.9] + - [789, 8611.9] - - [448, 4, 1, 256] - - [632, 78.6534] + - [694, 78.6534] - - [5888, 448, 1, 128] - - [663, 3592.03] + - [725, 3592.03] - - [512, 48000, 1, 2560] - - [697, 9237.44] + - [759, 9237.44] - - [8448, 16, 1, 2816] - - [562, 3360.21] + - [624, 3360.21] - - [704, 6784, 1, 3328] - - [698, 7774.95] + - [760, 7774.95] - - [5888, 5888, 1, 1280] - - [684, 9238.25] + - [746, 9238.25] - - [5056, 1024, 1, 1280] - - [712, 8227.88] + - [774, 8227.88] - - [448, 5888, 1, 3328] - - [687, 7777.63] + - [749, 7777.63] - - [3072, 2, 1, 1024] - - [624, 376.383] + - [686, 376.383] - - [1024, 2944, 1, 1280] - - [677, 8650.45] + - [739, 8650.45] - - [5056, 5888, 1, 1280] - - [687, 8861.6] + - [749, 8861.6] - - [4288, 5888, 1, 128] - - [667, 5049.01] + - [729, 5049.01] - - [256, 3584, 1, 256] - - [679, 6314.11] + - [741, 6314.11] - - [256, 4, 1, 1280] - - [723, 163.94] + - [785, 163.94] - - [1408, 3584, 1, 128] - - [667, 4290.22] + - [729, 4290.22] - - [256, 2944, 1, 3328] - - [689, 7620.99] + - [751, 7620.99] - - [448, 3584, 1, 128] - - [667, 3353.9] + - [729, 3353.9] - - [5888, 2944, 1, 1280] - - [677, 9498.31] + - [739, 9498.31] - - [4, 6784, 1, 1280] - - [627, 623.916] + - [689, 623.916] - - [2368, 5888, 1, 128] - - [666, 4840.29] + - [728, 4840.29] - - [35, 8457, 1, 1760] - - [573, 4059.88] + - [635, 4059.88] - - [64, 2944, 1, 128] - - [584, 1310.82] + - [646, 1310.82] - - [2368, 4, 1, 256] - - [718, 369.739] + - [780, 369.739] - - [3584, 5888, 1, 256] - - [695, 7996.33] + - [757, 7996.33] - - [2368, 1024, 1, 128] - - [661, 3915.07] + - [723, 3915.07] - - [2368, 704, 1, 128] - - [661, 3658.97] + - [723, 3658.97] - - [512, 32, 1, 512] - - [650, 1127.6] + - [712, 1127.6] - - [3584, 2368, 1, 128] - - [661, 4462.48] + - [723, 4462.48] - - [5056, 704, 1, 128] - - [660, 4062.21] + - [722, 4062.21] - - [448, 2368, 1, 128] - - [661, 2829.07] + - [723, 2829.07] - - [4, 5056, 1, 256] - - [609, 425.868] + - [671, 425.868] - - [5056, 1408, 1, 3328] - - [694, 8848.92] + - [756, 8848.92] - - [1408, 704, 1, 256] - - [689, 5394.56] + - [751, 5394.56] - - [6784, 1024, 1, 3328] - - [676, 9232.02] + - [738, 9232.02] - - [6784, 2944, 1, 3328] - - [687, 8714.84] + - [749, 8714.84] - - [7680, 1, 1, 2560] - - [623, 248.845] + - [685, 248.845] - - [1856, 1856, 1, 256] - - [688, 7586.58] + - [750, 7586.58] - - [64, 64, 1, 3328] - - [658, 1363.25] + - [720, 1363.25] - - [512, 1, 1, 512] - - [572, 43.2158] + - [634, 43.2158] - - [6784, 2368, 1, 1280] - - [689, 8665.74] + - [751, 8665.74] - - [4608, 2, 1, 1536] - - [572, 452.65] + - [634, 452.65] - - [4288, 3584, 1, 256] - - [697, 8936.7] + - [759, 8936.7] - - [4288, 5888, 1, 1280] - - [694, 8957.15] + - [756, 8957.15] - - [4608, 4, 1, 1536] - - [565, 846.737] + - [627, 846.737] - - [1024, 6000, 1, 1536] - - [687, 8398.54] + - [749, 8398.54] - - [8448, 32, 1, 2816] - - [647, 5343.07] + - [709, 5343.07] - - [448, 2944, 1, 3328] - - [694, 7247.04] + - [756, 7247.04] - - [4288, 1856, 1, 1280] - - [677, 8902.86] + - [739, 8902.86] - - [1856, 2944, 1, 3328] - - [689, 8622.86] + - [751, 8622.86] - - [256, 6784, 1, 3328] - - [689, 8050.77] + - [751, 8050.77] - - [512, 3000, 1, 1536] - - [710, 7108.12] + - [772, 7108.12] - - [64, 5888, 1, 256] - - [702, 3567.74] + - [764, 3567.74] - - [256, 5056, 1, 128] - - [669, 3041.12] + - [731, 3041.12] - - [5056, 1024, 1, 256] - - [693, 8401.47] + - [755, 8401.47] - - [704, 64, 1, 3328] - - [652, 4299.02] + - [714, 4299.02] - - [5056, 1856, 1, 3328] - - [697, 8660.77] + - [759, 8660.77] - - [4, 2944, 1, 3328] - - [627, 618.637] + - [689, 618.637] - - [512, 1500, 1, 2048] - - [709, 5481.22] + - [771, 5481.22] - - [1024, 1, 1, 500000] - - [563, 260.061] + - [625, 260.061] - - [256, 4, 1, 256] - - [627, 50.5123] + - [689, 50.5123] - - [6784, 128, 1, 3328] - - [681, 6950.91] + - [743, 6950.91] - - [4288, 1408, 1, 128] - - [661, 4539.58] + - [723, 4539.58] - - [1856, 5888, 1, 3328] - - [687, 8712.93] + - [749, 8712.93] - - [4288, 5056, 1, 256] - - [693, 8997.15] + - [755, 8997.15] - - [1408, 128, 1, 1280] - - [616, 4599.12] + - [678, 4599.12] - - [4096, 7000, 1, 4096] - - [683, 8555.89] + - [745, 8555.89] - - [5056, 256, 1, 3328] - - [689, 8257.16] + - [751, 8257.16] - - [704, 704, 1, 256] - - [679, 5852.39] + - [741, 5852.39] - - [1024, 3000, 1, 2560] - - [676, 8258.84] + - [738, 8258.84] - - [1024, 5888, 1, 1280] - - [676, 8988.99] + - [738, 8988.99] - - [6784, 2368, 1, 128] - - [662, 4562.25] + - [724, 4562.25] - - [4, 5056, 1, 1280] - - [627, 600.441] + - [689, 600.441] - - [256, 64, 1, 1280] - - [650, 1899.69] + - [712, 1899.69] - - [128, 1856, 1, 1280] - - [689, 5185.76] + - [751, 5185.76] - - [1856, 1024, 1, 1280] - - [694, 7875.95] + - [756, 7875.95] - - [6784, 4288, 1, 1280] - - [697, 8981.18] + - [759, 8981.18] - - [1856, 1856, 1, 1280] - - [678, 7794.71] + - [740, 7794.71] - - [35, 1500, 1, 2048] - - [578, 2192.6] + - [640, 2192.6] - - [3072, 24000, 1, 1024] - - [690, 8690.58] + - [752, 8690.58] - - [1408, 5056, 1, 1280] - - [689, 8427.87] + - [751, 8427.87] - - [4, 2368, 1, 3328] - - [632, 594.422] + - [694, 594.422] - - [5888, 1856, 1, 128] - - [661, 4294.05] + - [723, 4294.05] - - [448, 704, 1, 1280] - - [684, 4136.39] + - [746, 4136.39] - - [448, 6784, 1, 128] - - [662, 3976.2] + - [724, 3976.2] - - [1024, 448, 1, 3328] - - [694, 6376.33] + - [756, 6376.33] - - [2944, 128, 1, 256] - - [679, 4466.26] + - [741, 4466.26] - - [5056, 3584, 1, 128] - - [667, 4997.18] + - [729, 4997.18] - - [5888, 5888, 1, 3328] - - [697, 8870.37] + - [759, 8870.37] - - [6784, 1024, 1, 256] - - [676, 8520.53] + - [738, 8520.53] - - [2944, 2368, 1, 256] - - [713, 6174.59] + - [775, 6174.59] - - [256, 448, 1, 256] - - [689, 1844.33] + - [751, 1844.33] - - [5056, 5888, 1, 3328] - - [678, 8076.65] + - [740, 8076.65] - - [1856, 1024, 1, 256] - - [689, 7188.92] + - [751, 7188.92] - - [512, 48000, 1, 1536] - - [700, 7282.2] + - [762, 7282.2] - - [3584, 448, 1, 1280] - - [679, 6869.1] + - [741, 6869.1] - - [1024, 1024, 1, 1280] - - [689, 8027.45] + - [751, 8027.45] - - [448, 5888, 1, 256] - - [679, 5765.84] + - [741, 5765.84] - - [2048, 128, 1, 2048] - - [637, 4835.01] + - [699, 4835.01] - - [1408, 6784, 1, 3328] - - [689, 8613.76] + - [751, 8613.76] - - [448, 1024, 1, 128] - - [660, 2315.57] + - [722, 2315.57] - - [4288, 704, 1, 128] - - [661, 4138.92] + - [723, 4138.92] - - [128, 1856, 1, 128] - - [596, 1397.56] + - [658, 1397.56] - - [448, 2368, 1, 3328] - - [679, 6786.48] + - [741, 6786.48] - - [5056, 64, 1, 128] - - [661, 1664.84] + - [723, 1664.84] - - [5056, 2944, 1, 256] - - [712, 7697.49] + - [774, 7697.49] - - [6784, 5888, 1, 128] - - [661, 5003.67] + - [723, 5003.67] - - [1024, 700, 1, 512] - - [689, 6036.31] + - [751, 6036.31] - - [3072, 1, 1, 128] - - [643, 70.3171] + - [705, 70.3171] - - [1024, 4, 1, 256] - - [601, 154.302] + - [663, 154.302] - - [2944, 704, 1, 128] - - [667, 3697.0] + - [729, 3697.0] - - [128, 6784, 1, 1280] - - [679, 6731.51] + - [741, 6731.51] - - [1408, 3584, 1, 3328] - - [677, 9258.07] + - [739, 9258.07] - - [2368, 6784, 1, 256] - - [676, 8840.4] + - [738, 8840.4] - - [5056, 1408, 1, 1280] - - [677, 9240.84] + - [739, 9240.84] - - [5056, 4288, 1, 128] - - [672, 4309.18] + - [734, 4309.18] - - [4, 704, 1, 256] - - [627, 130.697] + - [689, 130.697] - - [4288, 2368, 1, 3328] - - [690, 8755.33] + - [752, 8755.33] - - [1408, 1856, 1, 128] - - [660, 3918.75] + - [722, 3918.75] - - [1408, 5888, 1, 3328] - - [697, 8910.47] + - [759, 8910.47] - - [1856, 256, 1, 256] - - [679, 5631.34] + - [741, 5631.34] - - [6784, 6784, 1, 256] - - [687, 9298.76] + - [749, 9298.76] - - [5888, 5056, 1, 128] - - [662, 4811.36] + - [724, 4811.36] - - [4288, 2368, 1, 128] - - [661, 4749.1] + - [723, 4749.1] - - [128, 5888, 1, 1280] - - [688, 6393.86] + - [750, 6393.86] - - [256, 4288, 1, 1280] - - [679, 6887.79] + - [741, 6887.79] - - [2368, 2944, 1, 256] - - [693, 8314.82] + - [755, 8314.82] - - [4, 1856, 1, 256] - - [716, 267.03] + - [778, 267.03] - - [3584, 1856, 1, 1280] - - [677, 8631.91] + - [739, 8631.91] - - [6784, 6784, 1, 128] - - [667, 5059.96] + - [729, 5059.96] - - [256, 1856, 1, 128] - - [660, 1858.82] + - [722, 1858.82] - - [49, 512, 64, 2048] - - [731, 3053.67] + - [793, 3053.67] - - [704, 64, 1, 1280] - - [610, 2849.49] + - [672, 2849.49] - - [5888, 5056, 1, 256] - - [696, 8202.52] + - [758, 8202.52] - - [8448, 48000, 1, 2816] - - [687, 4281.94] + - [749, 4281.94] - - [512, 6000, 1, 2048] - - [679, 8047.89] + - [741, 8047.89] - - [3584, 448, 1, 256] - - [689, 6805.43] + - [751, 6805.43] - - [448, 4288, 1, 128] - - [667, 3500.83] + - [729, 3500.83] - - [7680, 64, 1, 2560] - - [622, 5957.9] + - [684, 5957.9] - - [256, 6784, 1, 256] - - [689, 7331.83] + - [751, 7331.83] - - [1408, 4288, 1, 128] - - [661, 4501.49] + - [723, 4501.49] - - [2944, 704, 1, 3328] - - [689, 8439.7] + - [751, 8439.7] - - [128, 448, 1, 256] - - [610, 1555.19] + - [672, 1555.19] - - [2048, 32, 1, 2048] - - [621, 3226.49] + - [683, 3226.49] - - [3584, 3584, 1, 256] - - [693, 8784.9] + - [755, 8784.9] - - [448, 1408, 1, 128] - - [660, 2535.92] + - [722, 2535.92] - - [128, 256, 1, 1280] - - [636, 2896.72] + - [698, 2896.72] - - [3584, 5056, 1, 256] - - [680, 8566.52] + - [742, 8566.52] - - [6784, 128, 1, 256] - - [679, 6053.97] + - [741, 6053.97] - - [4288, 4, 1, 256] - - [599, 428.9] + - [661, 428.9] - - [64, 1408, 1, 3328] - - [604, 5025.11] + - [666, 5025.11] - - [704, 448, 1, 256] - - [703, 3409.74] + - [765, 3409.74] - - [2944, 2368, 1, 1280] - - [677, 9066.35] + - [739, 9066.35] - - [448, 64, 1, 3328] - - [652, 3528.96] + - [714, 3528.96] - - [704, 6784, 1, 128] - - [666, 4212.61] + - [728, 4212.61] - - [3584, 4, 1, 3328] - - [719, 658.353] + - [781, 658.353] - - [6784, 3584, 1, 256] - - [687, 9061.84] + - [749, 9061.84] - - [704, 448, 1, 128] - - [666, 1552.8] + - [728, 1552.8] - - [256, 128, 1, 128] - - [591, 281.975] + - [653, 281.975] - - [704, 1408, 1, 128] - - [666, 3026.76] + - [728, 3026.76] - - [4, 448, 1, 128] - - [715, 5.56127] + - [777, 5.56127] - - [4288, 128, 1, 1280] - - [646, 5471.64] + - [708, 5471.64] - - [128, 1408, 1, 256] - - [689, 2813.35] + - [751, 2813.35] - - [4, 2944, 1, 256] - - [609, 316.766] + - [671, 316.766] - - [64, 128, 1, 3328] - - [657, 1872.56] + - [719, 1872.56] - - [1856, 1408, 1, 256] - - [679, 7735.89] + - [741, 7735.89] - - [5056, 2368, 1, 128] - - [661, 4830.19] + - [723, 4830.19] - - [2944, 2944, 1, 3328] - - [697, 8890.11] + - [759, 8890.11] - - [5056, 6784, 1, 256] - - [687, 9015.25] + - [749, 9015.25] - - [1856, 3584, 1, 128] - - [668, 4455.12] + - [730, 4455.12] - - [5888, 4, 1, 1280] - - [717, 642.063] + - [779, 642.063] - - [128, 2944, 1, 128] - - [586, 2037.03] + - [648, 2037.03] - - [35, 8457, 1, 2560] - - [574, 3988.23] + - [636, 3988.23] - - [3584, 6784, 1, 128] - - [661, 4774.54] + - [723, 4774.54] - - [128, 4288, 1, 256] - - [679, 4851.85] + - [741, 4851.85] - - [704, 448, 1, 3328] - - [694, 4432.63] + - [756, 4432.63] - - [2368, 6784, 1, 1280] - - [677, 9161.48] + - [739, 9161.48] - - [128, 128, 1, 3328] - - [651, 2839.99] + - [713, 2839.99] - - [5056, 1856, 1, 256] - - [693, 8380.94] + - [755, 8380.94] - - [256, 128, 1, 256] - - [635, 1165.18] + - [697, 1165.18] - - [1024, 3000, 1, 2816] - - [694, 8714.27] + - [756, 8714.27] - - [1024, 1856, 1, 256] - - [684, 7014.79] + - [746, 7014.79] - - [64, 1, 1, 1216] - - [657, 11.8205] + - [719, 11.8205] - - [4288, 64, 1, 128] - - [588, 1669.65] + - [650, 1669.65] - - [256, 448, 1, 3328] - - [612, 5152.39] + - [674, 5152.39] - - [1408, 6784, 1, 1280] - - [697, 8735.22] + - [759, 8735.22] - - [3584, 3584, 1, 1280] - - [694, 9020.09] + - [756, 9020.09] - - [7680, 24000, 1, 2560] - - [697, 6940.24] + - [759, 6940.24] - - [64, 2368, 1, 1280] - - [607, 4433.07] + - [669, 4433.07] - - [448, 2368, 1, 1280] - - [682, 5352.92] + - [744, 5352.92] - - [4608, 48000, 1, 1536] - - [676, 8129.11] + - [738, 8129.11] - - [5888, 5888, 1, 128] - - [669, 4700.91] + - [731, 4700.91] - - [64, 6784, 1, 3328] - - [679, 6170.82] + - [741, 6170.82] - - [2944, 256, 1, 1280] - - [709, 6177.65] + - [771, 6177.65] - - [2048, 16, 1, 2048] - - [631, 2167.7] + - [693, 2167.7] - - [256, 2368, 1, 128] - - [660, 2037.77] + - [722, 2037.77] - - [5056, 2368, 1, 3328] - - [677, 9040.6] + - [739, 9040.6] - - [2944, 4288, 1, 256] - - [708, 7552.22] + - [770, 7552.22] - - [1408, 3584, 1, 1280] - - [684, 8808.76] + - [746, 8808.76] - - [2368, 64, 1, 256] - - [620, 2320.51] + - [682, 2320.51] - - [1024, 128, 1, 128] - - [580, 1075.56] + - [642, 1075.56] - - [704, 128, 1, 3328] - - [613, 4985.02] + - [675, 4985.02] - - [5888, 4, 1, 128] - - [714, 33.6558] + - [776, 33.6558] - - [1856, 704, 1, 256] - - [689, 7110.98] + - [751, 7110.98] - - [1024, 1500, 1, 2816] - - [684, 8499.88] + - [746, 8499.88] - - [8448, 1, 1, 2816] - - [567, 251.469] + - [629, 251.469] - - [1024, 4, 1, 3328] - - [723, 541.032] + - [785, 541.032] - - [1024, 6000, 1, 2048] - - [684, 8698.59] + - [746, 8698.59] - - [512, 24000, 1, 2560] - - [677, 8963.7] + - [739, 8963.7] - - [6144, 3000, 1, 2560] - - [700, 8761.97] + - [762, 8761.97] - - [2368, 6784, 1, 3328] - - [694, 8867.49] + - [756, 8867.49] - - [1856, 1408, 1, 1280] - - [681, 7908.53] + - [743, 7908.53] - - [1856, 448, 1, 1280] - - [694, 6544.01] + - [756, 6544.01] - - [6784, 704, 1, 128] - - [660, 4086.45] + - [722, 4086.45] - - [4, 4, 1, 256] - - [627, 0.852941] + - [689, 0.852941] - - [128, 5888, 1, 128] - - [584, 2582.25] + - [646, 2582.25] - - [5056, 2944, 1, 128] - - [664, 4579.17] + - [726, 4579.17] - - [1408, 5888, 1, 256] - - [676, 8810.77] + - [738, 8810.77] - - [704, 2944, 1, 1280] - - [691, 8420.9] + - [753, 8420.9] - - [4288, 64, 1, 1280] - - [616, 4906.15] + - [678, 4906.15] - - [256, 64, 1, 256] - - [618, 689.953] + - [680, 689.953] - - [1024, 1024, 1, 256] - - [694, 5528.01] + - [756, 5528.01] - - [704, 1856, 1, 256] - - [678, 4452.92] + - [740, 4452.92] - - [2560, 64, 1, 2560] - - [607, 4563.09] + - [669, 4563.09] - - [3584, 704, 1, 1280] - - [684, 7898.77] + - [746, 7898.77] - - [256, 128, 1, 1280] - - [636, 2865.06] + - [698, 2865.06] - - [5888, 2368, 1, 256] - - [683, 8628.37] + - [745, 8628.37] - - [256, 2368, 1, 1280] - - [679, 6073.57] + - [741, 6073.57] - - [2944, 6784, 1, 128] - - [660, 4756.77] + - [722, 4756.77] - - [3584, 448, 1, 3328] - - [679, 7265.07] + - [741, 7265.07] - - [1408, 4, 1, 256] - - [720, 234.157] + - [782, 234.157] - - [704, 2368, 1, 3328] - - [677, 7248.98] + - [739, 7248.98] - - [2944, 448, 1, 256] - - [684, 6365.89] + - [746, 6365.89] - - [1856, 448, 1, 128] - - [662, 2976.34] + - [724, 2976.34] - - [4608, 6000, 1, 1536] - - [697, 9469.42] + - [759, 9469.42] - - [2368, 128, 1, 1280] - - [646, 4773.39] + - [708, 4773.39] - - [256, 5888, 1, 128] - - [661, 3112.0] + - [723, 3112.0] - - [64, 6784, 1, 256] - - [679, 3755.14] + - [741, 3755.14] - - [64, 5056, 1, 1280] - - [640, 4935.6] + - [702, 4935.6] - - [4, 6784, 1, 128] - - [715, 111.142] + - [777, 111.142] - - [3025, 64, 64, 64] - - [729, 6643.75] + - [791, 6643.75] - - [2944, 2944, 1, 1280] - - [677, 8869.55] + - [739, 8869.55] - - [5056, 448, 1, 3328] - - [710, 6706.2] + - [772, 6706.2] - - [4, 3584, 1, 1280] - - [627, 573.54] + - [689, 573.54] - - [1408, 128, 1, 128] - - [579, 1293.19] + - [641, 1293.19] - - [6784, 704, 1, 3328] - - [694, 8368.33] + - [756, 8368.33] - - [128, 64, 1, 1280] - - [653, 1260.41] + - [715, 1260.41] - - [2368, 256, 1, 1280] - - [679, 6154.47] + - [741, 6154.47] - - [4, 448, 1, 3328] - - [632, 351.738] + - [694, 351.738] - - [5888, 4288, 1, 128] - - [661, 4340.99] + - [723, 4340.99] - - [4, 5888, 1, 256] - - [609, 428.318] + - [671, 428.318] - - [1408, 2944, 1, 3328] - - [676, 9400.85] + - [738, 9400.85] - - [3584, 704, 1, 128] - - [663, 3392.55] + - [725, 3392.55] - - [64, 1024, 1, 256] - - [610, 1762.41] + - [672, 1762.41] - - [2368, 448, 1, 1280] - - [703, 5972.58] + - [765, 5972.58] - - [128, 3584, 1, 256] - - [679, 5224.32] + - [741, 5224.32] - - [704, 448, 1, 1280] - - [679, 4566.86] + - [741, 4566.86] - - [448, 5056, 1, 128] - - [661, 3876.19] + - [723, 3876.19] - - [6144, 4, 1, 2560] - - [603, 948.751] + - [665, 948.751] - - [5056, 3584, 1, 256] - - [693, 8162.56] + - [755, 8162.56] - - [4288, 4288, 1, 256] - - [700, 7653.34] + - [762, 7653.34] - - [1408, 5056, 1, 128] - - [667, 4554.34] + - [729, 4554.34] - - [2944, 3584, 1, 128] - - [673, 4147.0] + - [735, 4147.0] - - [3584, 2368, 1, 256] - - [694, 8195.05] + - [756, 8195.05] - - [5888, 5056, 1, 1280] - - [693, 9413.43] + - [755, 9413.43] - - [128, 1024, 1, 1280] - - [646, 4433.83] + - [708, 4433.83] - - [8448, 24000, 1, 2816] - - [687, 5227.12] + - [749, 5227.12] - - [64, 704, 1, 256] - - [610, 1441.89] + - [672, 1441.89] - - [4288, 256, 1, 1280] - - [709, 5687.8] + - [771, 5687.8] - - [3584, 3584, 1, 3328] - - [684, 9183.63] + - [746, 9183.63] - - [704, 64, 1, 128] - - [588, 402.835] + - [650, 402.835] - - [3072, 1500, 1, 128] - - [683, 7395.08] + - [745, 7395.08] - - [2048, 3136, 1, 512] - - [725, 8447.3] + - [787, 8447.3] - - [3025, 256, 64, 64] - - [733, 8063.79] + - [795, 8063.79] - - [5888, 6784, 1, 256] - - [677, 9282.01] + - [739, 9282.01] - - [4288, 2944, 1, 3328] - - [677, 9153.87] + - [739, 9153.87] - - [2944, 64, 1, 128] - - [594, 1463.53] + - [656, 1463.53] - - [1024, 128, 1, 3328] - - [644, 5377.41] + - [706, 5377.41] - - [1024, 16, 1, 500000] - - [560, 3997.13] + - [622, 3997.13] - - [4288, 128, 1, 3328] - - [648, 6053.31] + - [710, 6053.31] - - [7680, 128, 1, 2560] - - [694, 7769.24] + - [756, 7769.24] - - [256, 5056, 1, 1280] - - [703, 7200.84] + - [765, 7200.84] - - [1408, 256, 1, 128] - - [671, 1671.74] + - [733, 1671.74] - - [2944, 5888, 1, 3328] - - [683, 8642.18] + - [745, 8642.18] - - [6784, 5888, 1, 1280] - - [697, 8871.15] + - [759, 8871.15] - - [3072, 1, 1, 1024] - - [643, 205.972] + - [705, 205.972] - - [704, 128, 1, 256] - - [606, 1935.39] + - [668, 1935.39] - - [5888, 4288, 1, 1280] - - [684, 9176.7] + - [746, 9176.7] - - [1024, 24000, 1, 2048] - - [683, 8667.79] + - [745, 8667.79] - - [448, 256, 1, 1280] - - [616, 4327.95] + - [678, 4327.95] - - [5888, 3584, 1, 128] - - [661, 4669.45] + - [723, 4669.45] - - [64, 4288, 1, 3328] - - [641, 5375.04] + - [703, 5375.04] - - [448, 4, 1, 1280] - - [632, 289.716] + - [694, 289.716] - - [6784, 6784, 1, 3328] - - [690, 8306.73] + - [752, 8306.73] - - [5056, 4, 1, 1280] - - [602, 607.199] + - [664, 607.199] - - [4, 5888, 1, 3328] - - [627, 651.538] + - [689, 651.538] - - [256, 1408, 1, 1280] - - [679, 5177.09] + - [741, 5177.09] - - [3072, 16, 1, 1024] - - [638, 2207.63] + - [700, 2207.63] - - [704, 3584, 1, 128] - - [671, 3653.51] + - [733, 3653.51] - - [1024, 2, 1, 512] - - [658, 156.138] + - [720, 156.138] - - [5888, 448, 1, 3328] - - [679, 7896.85] + - [741, 7896.85] - - [2368, 4288, 1, 1280] - - [676, 8517.63] + - [738, 8517.63] - - [4288, 2944, 1, 128] - - [665, 4439.26] + - [727, 4439.26] - - [256, 64, 1, 3328] - - [651, 2704.76] + - [713, 2704.76] - - [2944, 64, 1, 3328] - - [616, 5647.15] + - [678, 5647.15] - - [6784, 64, 1, 3328] - - [689, 6434.61] + - [751, 6434.61] - - [5056, 2944, 1, 3328] - - [700, 8497.2] + - [762, 8497.2] - - [448, 128, 1, 256] - - [618, 1516.64] + - [680, 1516.64] - - [2944, 3584, 1, 256] - - [694, 8365.83] + - [756, 8365.83] - - [1408, 1408, 1, 3328] - - [677, 8440.42] + - [739, 8440.42] - - [1856, 128, 1, 1280] - - [679, 5242.93] + - [741, 5242.93] - - [3584, 3584, 1, 128] - - [661, 4385.94] + - [723, 4385.94] - - [64, 3584, 1, 256] - - [679, 3276.9] + - [741, 3276.9] - - [1408, 4, 1, 3328] - - [602, 605.504] + - [664, 605.504] - - [128, 2944, 1, 3328] - - [647, 6295.75] + - [709, 6295.75] - - [3584, 704, 1, 256] - - [684, 7711.64] + - [746, 7711.64] - - [2944, 448, 1, 3328] - - [695, 6503.97] + - [757, 6503.97] - - [1024, 2, 1, 500000] - - [564, 521.803] + - [626, 521.803] - - [3584, 1408, 1, 3328] - - [686, 8296.2] + - [748, 8296.2] - - [704, 3584, 1, 1280] - - [691, 7670.65] + - [753, 7670.65] - - [1024, 1408, 1, 128] - - [666, 2830.61] + - [728, 2830.61] - - [1856, 6784, 1, 256] - - [697, 8149.67] + - [759, 8149.67] - - [4288, 448, 1, 3328] - - [678, 7406.44] + - [740, 7406.44] - - [6784, 4288, 1, 128] - - [673, 4418.09] + - [735, 4418.09] - - [6784, 704, 1, 1280] - - [694, 8302.45] + - [756, 8302.45] - - [6144, 1, 1, 2560] - - [603, 243.427] + - [665, 243.427] - - [3584, 6784, 1, 256] - - [676, 9036.59] + - [738, 9036.59] - - [6144, 16, 1, 2560] - - [610, 3266.69] + - [672, 3266.69] - - [3584, 64, 1, 128] - - [594, 1555.19] + - [656, 1555.19] - - [5888, 1024, 1, 3328] - - [684, 8888.08] + - [746, 8888.08] - - [448, 64, 1, 128] - - [580, 248.074] + - [642, 248.074] - - [704, 6784, 1, 1280] - - [680, 7892.56] + - [742, 7892.56] - - [4, 448, 1, 256] - - [602, 70.8951] + - [664, 70.8951] - - [196, 1024, 64, 256] - - [728, 6630.86] + - [790, 6630.86] - - [5888, 128, 1, 256] - - [678, 5715.09] + - [740, 5715.09] - - [4096, 16, 1, 4096] - - [624, 3251.5] + - [686, 3251.5] - - [1856, 5056, 1, 3328] - - [693, 8740.27] + - [755, 8740.27] - - [4, 6784, 1, 256] - - [716, 360.412] + - [778, 360.412] - - [1024, 3584, 1, 128] - - [661, 3456.27] + - [723, 3456.27] - - [64, 704, 1, 3328] - - [629, 3817.47] + - [691, 3817.47] - - [2368, 2944, 1, 128] - - [667, 4605.47] + - [729, 4605.47] - - [5056, 64, 1, 256] - - [679, 3863.79] + - [741, 3863.79] - - [512, 1500, 1, 1536] - - [679, 6801.56] + - [741, 6801.56] - - [512, 1, 1, 500000] - - [568, 261.068] + - [630, 261.068] - - [5888, 2944, 1, 3328] - - [683, 8501.88] + - [745, 8501.88] - - [128, 3584, 1, 1280] - - [684, 5938.64] + - [746, 5938.64] - - [1024, 704, 1, 128] - - [670, 2172.29] + - [732, 2172.29] - - [1408, 2368, 1, 128] - - [666, 4023.2] + - [728, 4023.2] - - [5888, 2368, 1, 128] - - [667, 4424.62] + - [729, 4424.62] - - [128, 5056, 1, 3328] - - [679, 6692.16] + - [741, 6692.16] - - [3584, 6784, 1, 1280] - - [677, 9488.64] + - [739, 9488.64] - - [4288, 1856, 1, 256] - - [687, 8287.52] + - [749, 8287.52] - - [1856, 5888, 1, 256] - - [698, 7707.83] + - [760, 7707.83] - - [256, 256, 1, 256] - - [645, 1613.29] + - [707, 1613.29] - - [4288, 4288, 1, 3328] - - [687, 8923.59] + - [749, 8923.59] - - [1024, 1024, 1, 128] - - [667, 2553.71] + - [729, 2553.71] - - [4288, 1408, 1, 1280] - - [687, 8930.47] + - [749, 8930.47] - - [3584, 5056, 1, 128] - - [671, 4495.15] + - [733, 4495.15] - - [4, 1024, 1, 3328] - - [627, 415.694] + - [689, 415.694] - - [4, 704, 1, 128] - - [715, 13.9634] + - [777, 13.9634] - - [4288, 2368, 1, 256] - - [712, 7135.08] + - [774, 7135.08] - - [2944, 5056, 1, 1280] - - [684, 9118.61] + - [746, 9118.61] - - [448, 6784, 1, 256] - - [708, 5430.31] + - [770, 5430.31] - - [64, 128, 1, 128] - - [591, 83.057] + - [653, 83.057] - - [1856, 2368, 1, 128] - - [667, 4422.75] + - [729, 4422.75] - - [6784, 2368, 1, 3328] - - [680, 8769.4] + - [742, 8769.4] - - [1408, 6784, 1, 128] - - [667, 4739.0] + - [729, 4739.0] - - [256, 1024, 1, 1280] - - [689, 5722.21] + - [751, 5722.21] - - [704, 4, 1, 128] - - [715, 8.66578] + - [777, 8.66578] - - [1408, 4, 1, 128] - - [715, 26.1439] + - [777, 26.1439] - - [4288, 128, 1, 256] - - [689, 4865.38] + - [751, 4865.38] - - [4288, 1856, 1, 3328] - - [676, 9250.04] + - [738, 9250.04] - - [3584, 448, 1, 128] - - [667, 3029.59] + - [729, 3029.59] - - [64, 4288, 1, 128] - - [584, 1535.38] + - [646, 1535.38] - - [64, 448, 1, 3328] - - [654, 3457.36] + - [716, 3457.36] - - [448, 4, 1, 3328] - - [632, 367.328] + - [694, 367.328] - - [256, 4, 1, 3328] - - [723, 320.389] + - [785, 320.389] - - [4, 1408, 1, 1280] - - [720, 344.039] + - [782, 344.039] - - [3584, 64, 1, 1280] - - [608, 5191.07] + - [670, 5191.07] - - [1408, 448, 1, 128] - - [668, 2218.24] + - [730, 2218.24] - - [3584, 1024, 1, 1280] - - [690, 8253.11] + - [752, 8253.11] - - [1856, 5056, 1, 256] - - [708, 7552.55] + - [770, 7552.55] - - [4, 3584, 1, 256] - - [627, 325.456] + - [689, 325.456] - - [6784, 4288, 1, 3328] - - [683, 8655.34] + - [745, 8655.34] - - [4, 2944, 1, 1280] - - [627, 547.821] + - [689, 547.821] - - [1024, 4288, 1, 256] - - [684, 7788.83] + - [746, 7788.83] - - [5888, 3584, 1, 3328] - - [687, 9173.39] + - [749, 9173.39] - - [1856, 4, 1, 256] - - [718, 282.919] + - [780, 282.919] - - [4, 256, 1, 256] - - [627, 49.7485] + - [689, 49.7485] - - [5056, 3584, 1, 3328] - - [693, 8457.53] + - [755, 8457.53] - - [1408, 128, 1, 3328] - - [647, 5714.52] + - [709, 5714.52] - - [4, 64, 1, 1280] - - [723, 42.7667] + - [785, 42.7667] - - [2368, 1408, 1, 1280] - - [684, 8224.92] + - [746, 8224.92] - - [5056, 2944, 1, 1280] - - [676, 9295.13] + - [738, 9295.13] - - [8448, 6000, 1, 2816] - - [680, 8037.97] + - [742, 8037.97] - - [4, 4, 1, 128] - - [715, 0.1433898] + - [777, 0.1433898] - - [3584, 256, 1, 256] - - [679, 6116.79] + - [741, 6116.79] - - [3584, 2944, 1, 1280] - - [676, 8796.49] + - [738, 8796.49] - - [1024, 6784, 1, 256] - - [683, 8187.86] + - [745, 8187.86] - - [4, 128, 1, 256] - - [627, 30.4407] + - [689, 30.4407] - - [6784, 448, 1, 256] - - [679, 7862.3] + - [741, 7862.3] - - [5124, 9124, 1, 2048] - - [681, 8176.41] + - [743, 8176.41] - - [2944, 5056, 1, 3328] - - [676, 9328.34] + - [738, 9328.34] - - [6784, 4, 1, 128] - - [714, 204.9] + - [776, 204.9] - - [2944, 1408, 1, 128] - - [665, 3838.2] + - [727, 3838.2] - - [448, 128, 1, 3328] - - [630, 4632.16] + - [692, 4632.16] - - [64, 2944, 1, 3328] - - [647, 5663.47] + - [709, 5663.47] - - [5056, 6784, 1, 3328] - - [683, 8420.17] + - [745, 8420.17] - - [704, 2368, 1, 128] - - [667, 3321.79] + - [729, 3321.79] - - [3072, 1500, 1, 1024] - - [684, 8221.77] + - [746, 8221.77] - - [128, 2944, 1, 256] - - [679, 4550.52] + - [741, 4550.52] - - [128, 6784, 1, 128] - - [584, 2767.76] + - [646, 2767.76] - - [3584, 4288, 1, 256] - - [683, 8808.64] + - [745, 8808.64] - - [448, 1856, 1, 256] - - [688, 5166.63] + - [750, 5166.63] - - [1856, 6784, 1, 3328] - - [680, 8339.76] + - [742, 8339.76] - - [3584, 128, 1, 3328] - - [689, 6791.57] + - [751, 6791.57] - - [64, 1856, 1, 256] - - [611, 2210.03] + - [673, 2210.03] - - [64, 448, 1, 256] - - [643, 1008.35] + - [705, 1008.35] - - [5888, 4288, 1, 256] - - [683, 8869.63] + - [745, 8869.63] - - [128, 1500, 1, 1280] - - [640, 4733.54] + - [702, 4733.54] - - [5056, 1408, 1, 256] - - [681, 7523.31] + - [743, 7523.31] - - [35, 8457, 1, 4096] - - [574, 4023.17] + - [636, 4023.17] - - [64, 256, 1, 1280] - - [635, 1941.91] + - [697, 1941.91] - - [2944, 4, 1, 128] - - [714, 95.7426] + - [776, 95.7426] - - [3584, 1024, 1, 256] - - [706, 6553.68] + - [768, 6553.68] - - [512, 6000, 1, 1536] - - [680, 7357.25] + - [742, 7357.25] - - [256, 704, 1, 256] - - [679, 2912.81] + - [741, 2912.81] - - [5888, 5888, 1, 256] - - [690, 8802.7] + - [752, 8802.7] - - [4288, 1024, 1, 1280] - - [683, 8248.83] + - [745, 8248.83] - - [5888, 128, 1, 3328] - - [633, 6848.59] + - [695, 6848.59] - - [448, 6784, 1, 3328] - - [679, 8343.78] + - [741, 8343.78] - - [2944, 1408, 1, 1280] - - [676, 9229.48] + - [738, 9229.48] - - [3072, 6000, 1, 1024] - - [697, 9015.01] + - [759, 9015.01] - - [1024, 32, 1, 512] - - [618, 1498.07] + - [680, 1498.07] - - [2944, 1856, 1, 3328] - - [693, 7176.48] + - [755, 7176.48] - - [2368, 64, 1, 128] - - [584, 1206.48] + - [646, 1206.48] - - [256, 1024, 1, 128] - - [661, 1178.28] + - [723, 1178.28] - - [3584, 5888, 1, 1280] - - [683, 9023.58] + - [745, 9023.58] - - [64, 4, 1, 128] - - [715, 1.089372] + - [777, 1.089372] - - [6784, 1856, 1, 1280] - - [677, 8964.51] + - [739, 8964.51] - - [2944, 5056, 1, 256] - - [683, 8860.12] + - [745, 8860.12] - - [5888, 256, 1, 3328] - - [694, 8308.66] + - [756, 8308.66] - - [2944, 4288, 1, 128] - - [662, 4507.61] + - [724, 4507.61] - - [3584, 1408, 1, 256] - - [677, 8234.71] + - [739, 8234.71] - - [704, 3584, 1, 3328] - - [689, 7377.26] + - [751, 7377.26] - - [5056, 448, 1, 1280] - - [678, 7145.47] + - [740, 7145.47] - - [3584, 1856, 1, 3328] - - [694, 8954.81] + - [756, 8954.81] - - [64, 1408, 1, 128] - - [591, 731.974] + - [653, 731.974] - - [4288, 6784, 1, 1280] - - [683, 9166.55] + - [745, 9166.55] - - [1024, 3000, 1, 2048] - - [694, 7723.83] + - [756, 7723.83] - - [1408, 704, 1, 1280] - - [684, 7863.1] + - [746, 7863.1] - - [2944, 1024, 1, 256] - - [677, 5035.02] + - [739, 5035.02] - - [256, 64, 1, 128] - - [583, 150.757] + - [645, 150.757] - - [2368, 4288, 1, 3328] - - [681, 8568.84] + - [743, 8568.84] - - [4, 1408, 1, 256] - - [627, 219.885] + - [689, 219.885] - - [1024, 1408, 1, 1280] - - [709, 6761.13] + - [771, 6761.13] - - [64, 64, 1, 256] - - [609, 198.694] + - [671, 198.694] - - [704, 256, 1, 3328] - - [679, 4291.62] + - [741, 4291.62] - - [6784, 5056, 1, 256] - - [678, 8545.02] + - [740, 8545.02] - - [1856, 1856, 1, 128] - - [666, 4034.93] + - [728, 4034.93] - - [4288, 5888, 1, 256] - - [697, 8998.05] + - [759, 8998.05] - - [4, 704, 1, 3328] - - [632, 452.4] + - [694, 452.4] - - [35, 8457, 1, 2048] - - [575, 3375.37] + - [637, 3375.37] - - [448, 2944, 1, 256] - - [679, 6346.74] + - [741, 6346.74] - - [4, 4288, 1, 3328] - - [632, 630.978] + - [694, 630.978] - - [2944, 6784, 1, 256] - - [706, 8002.92] + - [768, 8002.92] - - [2944, 2944, 1, 128] - - [661, 4661.41] + - [723, 4661.41] - - [4, 4, 1, 1280] - - [632, 3.14762] + - [694, 3.14762] - - [1856, 3584, 1, 1280] - - [676, 8677.66] + - [738, 8677.66] - - [64, 2944, 1, 256] - - [679, 2926.95] + - [741, 2926.95] - - [3584, 1408, 1, 1280] - - [690, 8238.9] + - [752, 8238.9] - - [448, 256, 1, 128] - - [591, 1042.72] + - [653, 1042.72] - - [4288, 448, 1, 128] - - [667, 3698.82] + - [729, 3698.82] - - [5056, 256, 1, 1280] - - [684, 7058.5] + - [746, 7058.5] - - [1856, 1408, 1, 3328] - - [681, 8348.35] + - [743, 8348.35] - - [128, 128, 1, 128] - - [591, 145.736] + - [653, 145.736] - - [1024, 4288, 1, 3328] - - [677, 8042.61] + - [739, 8042.61] - - [448, 2368, 1, 256] - - [689, 5935.0] + - [751, 5935.0] - - [1024, 4, 1, 128] - - [715, 15.93] + - [777, 15.93] - - [64, 1408, 1, 1280] - - [613, 3865.49] + - [675, 3865.49] - - [64, 6784, 1, 1280] - - [709, 5629.61] + - [771, 5629.61] - - [5056, 448, 1, 256] - - [679, 7637.91] + - [741, 7637.91] - - [2944, 2368, 1, 3328] - - [687, 9112.44] + - [749, 9112.44] - - [704, 4288, 1, 3328] - - [679, 7950.2] + - [741, 7950.2] - - [1408, 128, 1, 256] - - [679, 2898.17] + - [741, 2898.17] - - [1024, 1856, 1, 1280] - - [677, 8087.51] + - [739, 8087.51] - - [6784, 1856, 1, 256] - - [708, 7538.25] + - [770, 7538.25] - - [512, 48000, 1, 2816] - - [676, 9704.21] + - [738, 9704.21] - - [512, 3000, 1, 2816] - - [678, 7621.63] + - [740, 7621.63] - - [128, 2368, 1, 3328] - - [641, 6038.94] + - [703, 6038.94] - - [1024, 5888, 1, 256] - - [693, 8185.82] + - [755, 8185.82] - - [64, 2944, 1, 1280] - - [640, 4540.24] + - [702, 4540.24] - - [6784, 1408, 1, 256] - - [693, 8574.0] + - [755, 8574.0] - - [5056, 64, 1, 3328] - - [641, 6310.97] + - [703, 6310.97] - - [128, 704, 1, 128] - - [580, 696.618] + - [642, 696.618] - - [1408, 2368, 1, 256] - - [679, 4995.06] + - [741, 4995.06] - - [1408, 1408, 1, 256] - - [676, 7552.34] + - [738, 7552.34] - - [4, 64, 1, 128] - - [714, 1.90441] + - [776, 1.90441] - - [64, 128, 1, 1280] - - [653, 1272.64] + - [715, 1272.64] - - [1024, 8, 1, 500000] - - [561, 2013.23] + - [623, 2013.23] - - [4, 2368, 1, 128] - - [715, 49.9526] + - [777, 49.9526] - - [2368, 2368, 1, 128] - - [666, 4483.8] + - [728, 4483.8] - - [64, 5888, 1, 128] - - [583, 1957.67] + - [645, 1957.67] - - [5888, 4, 1, 3328] - - [716, 638.798] + - [778, 638.798] - - [6784, 1408, 1, 128] - - [661, 4715.61] + - [723, 4715.61] - - [1408, 5056, 1, 256] - - [693, 8557.67] + - [755, 8557.67] - - [512, 50176, 1, 128] - - [724, 8809.39] + - [786, 8809.39] - - [5056, 128, 1, 3328] - - [616, 6810.66] + - [678, 6810.66] - - [128, 128, 1, 1280] - - [650, 1899.69] + - [712, 1899.69] - - [512, 2, 1, 512] - - [570, 87.4813] + - [632, 87.4813] - - [448, 704, 1, 256] - - [689, 3765.97] + - [751, 3765.97] - - [4288, 3584, 1, 128] - - [674, 4563.77] + - [736, 4563.77] - - [2944, 128, 1, 3328] - - [616, 6507.45] + - [678, 6507.45] - - [128, 5056, 1, 1280] - - [679, 6557.85] + - [741, 6557.85] - - [3584, 5056, 1, 1280] - - [676, 9407.93] + - [738, 9407.93] - - [256, 448, 1, 1280] - - [640, 4096.1] + - [702, 4096.1] - - [704, 704, 1, 128] - - [666, 2374.31] + - [728, 2374.31] - - [5056, 4, 1, 128] - - [714, 125.52] + - [776, 125.52] - - [704, 256, 1, 1280] - - [689, 4016.23] + - [751, 4016.23] - - [64, 2368, 1, 3328] - - [646, 5159.29] + - [708, 5159.29] - - [1856, 1024, 1, 128] - - [666, 3356.47] + - [728, 3356.47] - - [1856, 64, 1, 128] - - [583, 945.644] + - [645, 945.644] - - [4096, 64, 1, 4096] - - [649, 6260.24] + - [711, 6260.24] - - [1024, 24000, 1, 1536] - - [693, 9368.5] + - [755, 9368.5] - - [704, 4288, 1, 256] - - [690, 7329.39] + - [752, 7329.39] - - [5888, 2368, 1, 1280] - - [679, 8624.71] + - [741, 8624.71] - - [6784, 1856, 1, 3328] - - [683, 9012.45] + - [745, 9012.45] - - [64, 128, 1, 256] - - [609, 374.591] + - [671, 374.591] - - [2368, 5888, 1, 1280] - - [677, 9045.76] + - [739, 9045.76] - - [5888, 256, 1, 1280] - - [694, 7999.17] + - [756, 7999.17] - - [4, 5888, 1, 1280] - - [627, 615.839] + - [689, 615.839] - - [704, 128, 1, 128] - - [583, 693.269] + - [645, 693.269] - - [1024, 4, 1, 1280] - - [722, 372.464] + - [784, 372.464] - - [2368, 1856, 1, 3328] - - [694, 8246.91] + - [756, 8246.91] - - [2368, 128, 1, 128] - - [584, 1963.53] + - [646, 1963.53] - - [2944, 704, 1, 256] - - [694, 7116.24] + - [756, 7116.24] - - [5056, 128, 1, 128] - - [587, 2519.49] + - [649, 2519.49] - - [2368, 1024, 1, 3328] - - [679, 7959.13] + - [741, 7959.13] - - [35, 700, 1, 2048] - - [575, 1766.86] + - [637, 1766.86] - - [256, 704, 1, 3328] - - [679, 4296.56] + - [741, 4296.56] - - [704, 3584, 1, 256] - - [678, 7441.61] + - [740, 7441.61] - - [704, 2944, 1, 3328] - - [695, 7195.81] + - [757, 7195.81] - - [6784, 1024, 1, 128] - - [666, 4509.18] + - [728, 4509.18] - - [256, 448, 1, 128] - - [591, 838.003] + - [653, 838.003] - - [448, 1024, 1, 3328] - - [689, 6515.65] + - [751, 6515.65] - - [2944, 1024, 1, 3328] - - [684, 8751.63] + - [746, 8751.63] - - [2944, 5056, 1, 128] - - [661, 4799.73] + - [723, 4799.73] - - [2368, 256, 1, 256] - - [678, 4754.67] + - [740, 4754.67] - - [1408, 6784, 1, 256] - - [706, 7477.09] + - [768, 7477.09] - - [6784, 1408, 1, 3328] - - [684, 8968.57] + - [746, 8968.57] - - [4288, 6784, 1, 128] - - [659, 4455.74] + - [721, 4455.74] - - [1408, 2944, 1, 128] - - [671, 3862.79] + - [733, 3862.79] - - [704, 64, 1, 256] - - [610, 1441.89] + - [672, 1441.89] - - [3072, 4, 1, 1024] - - [628, 711.803] + - [690, 711.803] - - [256, 2368, 1, 3328] - - [703, 5199.73] + - [765, 5199.73] - - [6784, 2944, 1, 1280] - - [687, 8914.45] + - [749, 8914.45] - - [4288, 1856, 1, 128] - - [667, 4683.3] + - [729, 4683.3] - - [1856, 2944, 1, 128] - - [661, 4589.34] + - [723, 4589.34] - - [6784, 448, 1, 128] - - [661, 3918.53] + - [723, 3918.53] - - [64, 3584, 1, 128] - - [592, 1468.11] + - [654, 1468.11] - - [448, 5056, 1, 1280] - - [684, 7561.4] + - [746, 7561.4] - - [4288, 5056, 1, 1280] - - [676, 9304.11] + - [738, 9304.11] - - [2368, 1856, 1, 128] - - [666, 4322.17] + - [728, 4322.17] - - [128, 448, 1, 1280] - - [646, 3336.48] + - [708, 3336.48] - - [4288, 704, 1, 256] - - [689, 7834.65] + - [751, 7834.65] - - [256, 3584, 1, 128] - - [662, 2500.96] + - [724, 2500.96] - - [5888, 704, 1, 256] - - [708, 7244.49] + - [770, 7244.49] - - [3584, 1024, 1, 128] - - [673, 3169.03] + - [735, 3169.03] - - [256, 5888, 1, 3328] - - [694, 7763.47] + - [756, 7763.47] - - [1408, 4288, 1, 3328] - - [676, 9273.8] + - [738, 9273.8] - - [6784, 4288, 1, 256] - - [684, 8825.2] + - [746, 8825.2] - - [4288, 256, 1, 128] - - [663, 2621.54] + - [725, 2621.54] - - [448, 1856, 1, 3328] - - [704, 5859.8] + - [766, 5859.8] - - [5888, 256, 1, 256] - - [694, 7124.84] + - [756, 7124.84] - - [1024, 4, 1, 500000] - - [559, 1030.2] + - [621, 1030.2] - - [6784, 1024, 1, 1280] - - [676, 9083.11] + - [738, 9083.11] - - [5888, 1024, 1, 128] - - [663, 4297.16] + - [725, 4297.16] - - [1024, 128, 1, 256] - - [679, 2086.82] + - [741, 2086.82] - - [512, 16, 1, 500000] - - [560, 3921.96] + - [622, 3921.96] - - [128, 64, 1, 3328] - - [650, 1969.97] + - [712, 1969.97] - - [448, 64, 1, 256] - - [635, 1092.37] + - [697, 1092.37] - - [2368, 256, 1, 128] - - [666, 2174.84] + - [728, 2174.84] - - [6784, 3584, 1, 1280] - - [676, 9558.82] + - [738, 9558.82] - - [1024, 6784, 1, 1280] - - [685, 8637.72] + - [747, 8637.72] - - [2944, 64, 1, 1280] - - [607, 4770.13] + - [669, 4770.13] - - [1408, 2944, 1, 1280] - - [676, 9238.47] + - [738, 9238.47] - - [256, 1856, 1, 256] - - [702, 4498.43] + - [764, 4498.43] - - [1408, 2368, 1, 3328] - - [684, 8344.97] + - [746, 8344.97] - - [2944, 4, 1, 3328] - - [719, 661.209] + - [781, 661.209] - - [128, 1408, 1, 3328] - - [647, 5641.42] + - [709, 5641.42] - - [2944, 1856, 1, 128] - - [661, 4488.04] + - [723, 4488.04] - - [256, 2944, 1, 128] - - [671, 2233.18] + - [733, 2233.18] - - [256, 6784, 1, 128] - - [660, 3139.9] + - [722, 3139.9] - - [2368, 4, 1, 128] - - [715, 38.7612] + - [777, 38.7612] - - [1408, 256, 1, 3328] - - [711, 4927.67] + - [773, 4927.67] - - [1856, 4, 1, 128] - - [715, 42.3719] + - [777, 42.3719] - - [1024, 16, 1, 512] - - [627, 1115.61] + - [689, 1115.61] - - [5056, 6784, 1, 128] - - [662, 4963.45] + - [724, 4963.45] - - [4288, 5056, 1, 128] - - [660, 4928.09] + - [722, 4928.09] - - [1856, 5888, 1, 128] - - [667, 4865.15] + - [729, 4865.15] - - [7680, 2, 1, 2560] - - [603, 499.612] + - [665, 499.612] - - [3584, 1856, 1, 256] - - [693, 7978.38] + - [755, 7978.38] - - [4288, 3584, 1, 1280] - - [693, 7852.26] + - [755, 7852.26] - - [2368, 448, 1, 256] - - [708, 5238.93] + - [770, 5238.93] - - [4288, 256, 1, 3328] - - [679, 6751.34] + - [741, 6751.34] - - [1856, 704, 1, 128] - - [661, 3525.56] + - [723, 3525.56] - - [1408, 64, 1, 256] - - [620, 1884.8] + - [682, 1884.8] - - [64, 1856, 1, 128] - - [597, 888.205] + - [659, 888.205] - - [4, 256, 1, 128] - - [714, 7.38178] + - [776, 7.38178] - - [512, 16, 1, 512] - - [627, 663.756] + - [689, 663.756] - - [704, 5888, 1, 128] - - [661, 4424.55] + - [723, 4424.55] - - [6784, 3584, 1, 128] - - [663, 3823.4] + - [725, 3823.4] - - [1024, 64, 1, 256] - - [605, 1379.81] + - [667, 1379.81] - - [64, 2368, 1, 256] - - [679, 2424.93] + - [741, 2424.93] - - [5124, 1500, 1, 2048] - - [697, 8391.84] + - [759, 8391.84] - - [4288, 5056, 1, 3328] - - [683, 9274.14] + - [745, 9274.14] - - [4, 1856, 1, 1280] - - [627, 453.474] + - [689, 453.474] - - [4288, 128, 1, 128] - - [661, 2157.8] + - [723, 2157.8] - - [512, 2, 1, 500000] - - [571, 516.895] + - [633, 516.895] - - [1408, 1408, 1, 128] - - [662, 3600.49] + - [724, 3600.49] - - [7680, 16, 1, 2560] - - [642, 3542.59] + - [704, 3542.59] - - [1856, 128, 1, 128] - - [594, 1532.8] + - [656, 1532.8] - - [5056, 2368, 1, 256] - - [706, 7684.07] + - [768, 7684.07] - - [4288, 704, 1, 3328] - - [679, 7642.96] + - [741, 7642.96] - - [448, 3584, 1, 256] - - [689, 6734.07] + - [751, 6734.07] - - [2368, 64, 1, 1280] - - [640, 3962.24] + - [702, 3962.24] - - [2368, 1024, 1, 1280] - - [691, 7989.64] + - [753, 7989.64] - - [2944, 1408, 1, 3328] - - [694, 8954.66] + - [756, 8954.66] - - [6144, 1500, 1, 2560] - - [712, 8170.07] + - [774, 8170.07] - - [4224, 1, 1, 128] - - [643, 76.9] + - [705, 76.9] - - [1024, 1408, 1, 3328] - - [709, 6961.38] + - [771, 6961.38] - - [2944, 5888, 1, 1280] - - [690, 8797.53] + - [752, 8797.53] - - [8448, 2, 1, 2816] - - [565, 496.958] + - [627, 496.958] - - [1408, 4, 1, 1280] - - [720, 471.891] + - [782, 471.891] - - [5888, 3584, 1, 256] - - [697, 8246.3] + - [759, 8246.3] - - [2368, 5056, 1, 128] - - [660, 4906.9] + - [722, 4906.9] - - [1408, 1856, 1, 3328] - - [684, 9006.8] + - [746, 9006.8] - - [4, 4, 1, 3328] - - [632, 5.83793] + - [694, 5.83793] - - [5888, 5056, 1, 3328] - - [697, 8545.1] + - [759, 8545.1] - - [7680, 6000, 1, 2560] - - [690, 7996.0] + - [752, 7996.0] - - [6784, 1408, 1, 1280] - - [684, 8888.13] + - [746, 8888.13] - - [4, 1024, 1, 1280] - - [632, 302.109] + - [694, 302.109] - - [512, 3000, 1, 2560] - - [684, 7809.43] + - [746, 7809.43] - - [704, 2944, 1, 256] - - [689, 4909.24] + - [751, 4909.24] - - [4288, 64, 1, 256] - - [689, 3264.72] + - [751, 3264.72] - - [6784, 5888, 1, 3328] - - [697, 9544.52] + - [759, 9544.52] - - [2368, 4288, 1, 128] - - [660, 4873.03] + - [722, 4873.03] - - [64, 4288, 1, 1280] - - [646, 4656.42] + - [708, 4656.42] - - [6784, 64, 1, 1280] - - [679, 6230.43] + - [741, 6230.43] - - [3584, 128, 1, 128] - - [587, 2315.57] + - [649, 2315.57] - - [1024, 6784, 1, 128] - - [661, 3758.94] + - [723, 3758.94] - - [1024, 1500, 1, 1536] - - [710, 6972.0] + - [772, 6972.0] - - [1408, 64, 1, 3328] - - [613, 5079.58] + - [675, 5079.58] - - [6784, 4, 1, 256] - - [599, 487.938] + - [661, 487.938] - - [1408, 1408, 1, 1280] - - [712, 7423.31] + - [774, 7423.31] - - [256, 2368, 1, 256] - - [679, 4986.9] + - [741, 4986.9] - - [3072, 3000, 1, 1024] - - [681, 7844.01] + - [743, 7844.01] - - [448, 4288, 1, 3328] - - [680, 7204.79] + - [742, 7204.79] - - [2368, 1408, 1, 256] - - [712, 5897.96] + - [774, 5897.96] - - [704, 2368, 1, 256] - - [679, 7000.93] + - [741, 7000.93] - - [1024, 24000, 1, 2560] - - [706, 8562.31] + - [768, 8562.31] - - [2944, 448, 1, 1280] - - [694, 7155.93] + - [756, 7155.93] - - [5888, 2368, 1, 3328] - - [693, 9252.42] + - [755, 9252.42] - - [1024, 256, 1, 128] - - [675, 1255.88] + - [737, 1255.88] - - [5124, 9124, 1, 1760] - - [687, 9168.49] + - [749, 9168.49] - - [448, 1408, 1, 1280] - - [679, 6150.34] + - [741, 6150.34] - - [448, 1856, 1, 1280] - - [694, 6489.76] + - [756, 6489.76] - - [4288, 448, 1, 1280] - - [709, 6887.02] + - [771, 6887.02] - - [5888, 704, 1, 3328] - - [689, 8230.64] + - [751, 8230.64] - - [4, 1856, 1, 128] - - [715, 27.0964] + - [777, 27.0964] - - [5056, 256, 1, 128] - - [660, 3469.01] + - [722, 3469.01] - - [1856, 256, 1, 128] - - [661, 2534.16] + - [723, 2534.16] - - [128, 2368, 1, 256] - - [679, 3660.22] + - [741, 3660.22] - - [704, 4, 1, 256] - - [627, 134.596] + - [689, 134.596] - - [1024, 6784, 1, 3328] - - [681, 8482.75] + - [743, 8482.75] - - [1408, 5888, 1, 128] - - [661, 4644.52] + - [723, 4644.52] - - [4288, 4, 1, 128] - - [714, 35.8799] + - [776, 35.8799] - - [512, 3136, 1, 2048] - - [726, 6386.69] + - [788, 6386.69] - - [1408, 1024, 1, 256] - - [679, 5440.82] + - [741, 5440.82] - - [128, 64, 1, 256] - - [609, 380.019] + - [671, 380.019] - - [8448, 1500, 1, 2816] - - [676, 9155.92] + - [738, 9155.92] - - [256, 704, 1, 128] - - [661, 895.623] + - [723, 895.623] - - [2560, 7000, 1, 2560] - - [688, 8565.66] + - [750, 8565.66] - - [5888, 64, 1, 1280] - - [703, 5007.83] + - [765, 5007.83] - - [128, 4, 1, 3328] - - [722, 165.21] + - [784, 165.21] - - [5056, 6784, 1, 1280] - - [687, 9331.48] + - [749, 9331.48] - - [1024, 448, 1, 1280] - - [689, 6501.46] + - [751, 6501.46] - - [704, 5056, 1, 3328] - - [676, 8090.13] + - [738, 8090.13] - - [128, 5056, 1, 256] - - [689, 5537.37] + - [751, 5537.37] - - [3584, 5056, 1, 3328] - - [685, 8633.24] + - [747, 8633.24] - - [1856, 4, 1, 3328] - - [723, 582.814] + - [785, 582.814] - - [4, 2944, 1, 128] - - [714, 114.292] + - [776, 114.292] - - [2368, 2944, 1, 3328] - - [693, 8749.55] + - [755, 8749.55] - - [448, 448, 1, 1280] - - [617, 4694.93] + - [679, 4694.93] - - [128, 4, 1, 128] - - [714, 4.94734] + - [776, 4.94734] - - [2368, 3584, 1, 256] - - [693, 8418.59] + - [755, 8418.59] - - [4608, 3000, 1, 1536] - - [683, 9076.47] + - [745, 9076.47] - - [1024, 256, 1, 1280] - - [689, 5562.84] + - [751, 5562.84] - - [5056, 3584, 1, 1280] - - [683, 8365.09] + - [745, 8365.09] - - [5124, 9124, 1, 4096] - - [693, 8648.58] + - [755, 8648.58] - - [7680, 48000, 1, 2560] - - [687, 4098.26] + - [749, 4098.26] - - [1856, 704, 1, 1280] - - [679, 8141.04] + - [741, 8141.04] - - [1856, 2944, 1, 1280] - - [681, 8214.4] + - [743, 8214.4] - - [4608, 1500, 1, 1536] - - [689, 8424.53] + - [751, 8424.53] - - [1024, 48000, 1, 2816] - - [680, 8513.18] + - [742, 8513.18] - - [5124, 9124, 1, 2560] - - [697, 8641.24] + - [759, 8641.24] - - [128, 1024, 1, 256] - - [611, 2356.45] + - [673, 2356.45] - - [2944, 1408, 1, 256] - - [693, 8254.29] + - [755, 8254.29] - - [4288, 1408, 1, 3328] - - [687, 9138.49] + - [749, 9138.49] - - [3584, 64, 1, 3328] - - [600, 5629.62] + - [662, 5629.62] - - [5888, 2944, 1, 128] - - [661, 4119.33] + - [723, 4119.33] - - [2944, 1024, 1, 128] - - [663, 4002.96] + - [725, 4002.96] - - [128, 1, 1, 1024] - - [657, 20.0805] + - [719, 20.0805] - - [5124, 700, 1, 2048] - - [694, 7653.84] + - [756, 7653.84] - - [4, 4288, 1, 1280] - - [627, 587.749] + - [689, 587.749] - - [6784, 5056, 1, 128] - - [666, 4855.85] + - [728, 4855.85] - - [256, 1024, 1, 3328] - - [689, 6116.28] + - [751, 6116.28] - - [3584, 4, 1, 256] - - [601, 395.576] + - [663, 395.576] - - [1856, 64, 1, 3328] - - [616, 5732.6] + - [678, 5732.6] - - [4, 128, 1, 3328] - - [722, 162.689] + - [784, 162.689] - - [256, 12544, 1, 1024] - - [726, 7628.92] + - [788, 7628.92] - - [5888, 1408, 1, 3328] - - [687, 9524.43] + - [749, 9524.43] - - [448, 2944, 1, 128] - - [661, 3163.91] + - [723, 3163.91] - - [2368, 1856, 1, 256] - - [689, 8167.36] + - [751, 8167.36] - - [256, 5056, 1, 256] - - [679, 7292.13] + - [741, 7292.13] - - [5056, 5056, 1, 128] - - [667, 5043.99] + - [729, 5043.99] - - [448, 3584, 1, 3328] - - [684, 6839.56] + - [746, 6839.56] - - [4, 5056, 1, 3328] - - [632, 639.886] + - [694, 639.886] - - [256, 256, 1, 128] - - [591, 554.902] + - [653, 554.902] - - [5888, 256, 1, 128] - - [663, 3562.47] + - [725, 3562.47] - - [4, 5056, 1, 128] - - [714, 149.907] + - [776, 149.907] - - [448, 256, 1, 256] - - [610, 2121.5] + - [672, 2121.5] - - [704, 4, 1, 3328] - - [720, 455.919] + - [782, 455.919] - - [1408, 256, 1, 256] - - [679, 4352.68] + - [741, 4352.68] - - [3584, 1856, 1, 128] - - [670, 3933.23] + - [732, 3933.23] - - [4288, 4288, 1, 128] - - [661, 4888.61] + - [723, 4888.61] - - [1856, 1024, 1, 3328] - - [697, 8242.64] + - [759, 8242.64] - - [1856, 4288, 1, 128] - - [666, 4647.4] + - [728, 4647.4] - - [1024, 6000, 1, 2560] - - [691, 8526.75] + - [753, 8526.75] - - [1024, 5056, 1, 256] - - [676, 7343.83] + - [738, 7343.83] - - [5056, 5888, 1, 128] - - [665, 4053.5] + - [727, 4053.5] - - [2368, 1408, 1, 3328] - - [679, 8466.2] + - [741, 8466.2] - - [1024, 48000, 1, 1536] - - [697, 9487.74] + - [759, 9487.74] - - [5888, 448, 1, 256] - - [710, 6081.54] + - [772, 6081.54] - - [5888, 6784, 1, 128] - - [662, 4820.27] + - [724, 4820.27] - - [2368, 4, 1, 3328] - - [721, 620.628] + - [783, 620.628] - - [6784, 5056, 1, 1280] - - [706, 8525.5] + - [768, 8525.5] - - [5056, 704, 1, 1280] - - [676, 7933.06] + - [738, 7933.06] - - [1024, 48000, 1, 2560] - - [697, 8877.94] + - [759, 8877.94] - - [4608, 32, 1, 1536] - - [626, 3556.83] + - [688, 3556.83] - - [1024, 2368, 1, 128] - - [669, 2943.75] + - [731, 2943.75] - - [128, 704, 1, 256] - - [610, 2059.8] + - [672, 2059.8] - - [2368, 448, 1, 3328] - - [689, 5290.42] + - [751, 5290.42] - - [128, 5888, 1, 3328] - - [689, 7764.43] + - [751, 7764.43] - - [448, 128, 1, 1280] - - [640, 3373.28] + - [702, 3373.28] - - [6784, 4, 1, 3328] - - [599, 676.063] + - [661, 676.063] - - [4288, 4, 1, 1280] - - [632, 564.775] + - [694, 564.775] - - [1024, 64, 1, 3328] - - [646, 4293.48] + - [708, 4293.48] - - [3072, 48000, 1, 1024] - - [696, 7826.51] + - [758, 7826.51] - - [256, 4, 1, 128] - - [715, 4.93304] + - [777, 4.93304] - - [1024, 5888, 1, 128] - - [674, 3610.46] + - [736, 3610.46] - - [3584, 5888, 1, 128] - - [662, 4722.35] + - [724, 4722.35] - - [5056, 5888, 1, 256] - - [697, 9159.11] + - [759, 9159.11] - - [2368, 1024, 1, 256] - - [689, 7482.71] + - [751, 7482.71] - - [2944, 1856, 1, 256] - - [693, 8209.0] + - [755, 8209.0] - - [1856, 6784, 1, 1280] - - [689, 8205.43] + - [751, 8205.43] - - [64, 5056, 1, 128] - - [584, 2079.35] + - [646, 2079.35] - - [64, 6784, 1, 128] - - [584, 2437.58] + - [646, 2437.58] - - [448, 704, 1, 128] - - [660, 1506.45] + - [722, 1506.45] - - [4, 1024, 1, 128] - - [715, 17.3463] + - [777, 17.3463] - - [1408, 448, 1, 256] - - [679, 5545.45] + - [741, 5545.45] - - [1408, 704, 1, 128] - - [665, 2931.65] + - [727, 2931.65] - - [64, 256, 1, 3328] - - [651, 2816.52] + - [713, 2816.52] - - [8448, 3000, 1, 2816] - - [685, 8872.99] + - [747, 8872.99] - - [6784, 448, 1, 3328] - - [679, 7555.48] + - [741, 7555.48] - - [5056, 1856, 1, 1280] - - [677, 8652.36] + - [739, 8652.36] - - [1408, 1024, 1, 3328] - - [681, 7781.42] + - [743, 7781.42] - - [2368, 256, 1, 3328] - - [685, 5392.06] + - [747, 5392.06] - - [7680, 1500, 1, 2560] - - [683, 8919.72] + - [745, 8919.72] - - [5888, 3584, 1, 1280] - - [683, 9235.85] + - [745, 9235.85] - - [1856, 3584, 1, 3328] - - [694, 8348.83] + - [756, 8348.83] - - [5888, 128, 1, 1280] - - [679, 5928.61] + - [741, 5928.61] - - [1024, 2944, 1, 256] - - [710, 6630.27] + - [772, 6630.27] - - [448, 6784, 1, 1280] - - [691, 8332.45] + - [753, 8332.45] - - [256, 3584, 1, 1280] - - [681, 7140.19] + - [743, 7140.19] - - [448, 128, 1, 128] - - [583, 552.813] + - [645, 552.813] - - [704, 5056, 1, 256] - - [689, 7959.68] + - [751, 7959.68] - - [3584, 1024, 1, 3328] - - [681, 8386.84] + - [743, 8386.84] - - [2944, 1856, 1, 1280] - - [697, 7670.29] + - [759, 7670.29] - - [128, 256, 1, 128] - - [598, 258.37] + - [660, 258.37] - - [5056, 256, 1, 256] - - [689, 5736.77] + - [751, 5736.77] - - [2944, 4288, 1, 3328] - - [676, 8730.8] + - [738, 8730.8] - - [2368, 3584, 1, 3328] - - [678, 8437.71] + - [740, 8437.71] - - [2944, 704, 1, 1280] - - [689, 8342.53] + - [751, 8342.53] - - [128, 4, 1, 256] - - [609, 24.9242] + - [671, 24.9242] - - [2944, 3584, 1, 1280] - - [691, 8322.11] + - [753, 8322.11] - - [1856, 5888, 1, 1280] - - [676, 8911.91] + - [738, 8911.91] - - [256, 256, 1, 1280] - - [640, 3653.67] + - [702, 3653.67] - - [4608, 24000, 1, 1536] - - [690, 8931.06] + - [752, 8931.06] - - [4288, 1408, 1, 256] - - [677, 8338.45] + - [739, 8338.45] - - [3584, 64, 1, 256] - - [689, 3414.07] + - [751, 3414.07] - - [64, 1856, 1, 3328] - - [616, 5460.23] + - [678, 5460.23] - - [256, 1408, 1, 128] - - [660, 1424.09] + - [722, 1424.09] - - [5888, 1408, 1, 128] - - [671, 4177.88] + - [733, 4177.88] - - [4288, 2368, 1, 1280] - - [680, 8596.05] + - [742, 8596.05] - - [4, 4288, 1, 256] - - [716, 370.954] + - [778, 370.954] - - [256, 4288, 1, 128] - - [661, 2907.99] + - [723, 2907.99] - - [256, 128, 1, 3328] - - [654, 3644.88] + - [716, 3644.88] - - [512, 8, 1, 500000] - - [566, 2025.89] + - [628, 2025.89] - - [6784, 2368, 1, 256] - - [679, 8470.41] + - [741, 8470.41] - - [5888, 128, 1, 128] - - [584, 2604.55] + - [646, 2604.55] - - [1408, 448, 1, 3328] - - [689, 6540.62] + - [751, 6540.62] - - [1024, 24000, 1, 2816] - - [706, 8364.03] + - [768, 8364.03] - - [704, 1024, 1, 1280] - - [689, 7277.28] + - [751, 7277.28] - - [1856, 256, 1, 3328] - - [679, 7039.14] + - [741, 7039.14] - - [1856, 2944, 1, 256] - - [688, 8151.59] + - [750, 8151.59] - - [5056, 1024, 1, 128] - - [662, 4422.82] + - [724, 4422.82] - - [64, 5888, 1, 1280] - - [640, 4854.62] + - [702, 4854.62] - - [7680, 3000, 1, 2560] - - [693, 8789.57] + - [755, 8789.57] - - [4224, 1500, 1, 176] - - [689, 7902.14] + - [751, 7902.14] - - [5124, 700, 1, 2560] - - [679, 8232.59] + - [741, 8232.59] - - [6784, 256, 1, 128] - - [660, 3548.92] + - [722, 3548.92] - - [5888, 704, 1, 128] - - [667, 3959.65] + - [729, 3959.65] - - [6784, 64, 1, 128] - - [595, 2150.82] + - [657, 2150.82] - - [4, 448, 1, 1280] - - [720, 268.063] + - [782, 268.063] - - [1024, 4288, 1, 1280] - - [694, 8363.72] + - [756, 8363.72] - - [2368, 5056, 1, 3328] - - [693, 8581.85] + - [755, 8581.85] - - [448, 4, 1, 128] - - [714, 16.8673] + - [776, 16.8673] - - [4, 256, 1, 3328] - - [723, 201.988] + - [785, 201.988] - - [4288, 1024, 1, 3328] - - [689, 8567.72] + - [751, 8567.72] - - [6144, 48000, 1, 2560] - - [697, 3751.68] + - [759, 3751.68] - - [1024, 5056, 1, 3328] - - [676, 9440.66] + - [738, 9440.66] - - [1024, 1856, 1, 3328] - - [697, 8244.36] + - [759, 8244.36] - - [704, 704, 1, 1280] - - [689, 5529.99] + - [751, 5529.99] - - [128, 2368, 1, 1280] - - [646, 5062.38] + - [708, 5062.38] - - [3584, 4, 1, 128] - - [715, 61.5949] + - [777, 61.5949] - - [3584, 256, 1, 1280] - - [713, 6260.24] + - [775, 6260.24] - - [4, 128, 1, 128] - - [714, 1.2587] + - [776, 1.2587] - - [128, 4288, 1, 3328] - - [625, 6186.15] + - [687, 6186.15] - - [5124, 1500, 1, 2560] - - [693, 8432.62] + - [755, 8432.62] - - [3584, 128, 1, 1280] - - [679, 6547.85] + - [741, 6547.85] - - [4, 256, 1, 1280] - - [632, 180.144] + - [694, 180.144] - - [128, 704, 1, 3328] - - [604, 5177.81] + - [666, 5177.81] - - [4288, 6784, 1, 256] - - [677, 9005.34] + - [739, 9005.34] - - [3584, 2944, 1, 3328] - - [694, 8872.27] + - [756, 8872.27] - - [128, 1856, 1, 256] - - [679, 3690.48] + - [741, 3690.48] - - [64, 4288, 1, 256] - - [679, 3007.57] + - [741, 3007.57] - - [4, 3584, 1, 3328] - - [609, 639.99] + - [671, 639.99] - - [64, 4, 1, 3328] - - [723, 98.7074] + - [785, 98.7074] - - [4, 64, 1, 3328] - - [723, 91.9069] + - [785, 91.9069] - - [35, 700, 1, 2560] - - [577, 2397.65] + - [639, 2397.65] - - [5888, 2944, 1, 256] - - [687, 9031.28] + - [749, 9031.28] - - [4, 2368, 1, 256] - - [627, 256.968] + - [689, 256.968] - - [1856, 64, 1, 256] - - [611, 2222.96] + - [673, 2222.96] - - [5056, 128, 1, 1280] - - [679, 6557.85] + - [741, 6557.85] - - [448, 4288, 1, 1280] - - [703, 6891.66] + - [765, 6891.66] - - [256, 4288, 1, 256] - - [679, 6250.51] + - [741, 6250.51] - - [1024, 4288, 1, 128] - - [663, 3951.41] + - [725, 3951.41] - - [4, 1024, 1, 256] - - [627, 182.144] + - [689, 182.144] - - [5056, 4288, 1, 256] - - [683, 8933.43] + - [745, 8933.43] - - [1024, 448, 1, 256] - - [689, 4573.33] + - [751, 4573.33] - - [1024, 3584, 1, 256] - - [684, 7447.18] + - [746, 7447.18] - - [2944, 128, 1, 1280] - - [689, 5417.27] + - [751, 5417.27] - - [49, 2048, 64, 512] - - [732, 5916.91] + - [794, 5916.91] - - [2560, 32, 1, 2560] - - [626, 4076.99] + - [688, 4076.99] - - [64, 256, 1, 256] - - [643, 689.953] + - [705, 689.953] - - [1024, 4, 1, 512] - - [635, 288.17] + - [697, 288.17] - - [128, 2368, 1, 128] - - [589, 1809.68] + - [651, 1809.68] - - [256, 704, 1, 1280] - - [679, 4033.08] + - [741, 4033.08] - - [64, 2368, 1, 128] - - [580, 1165.88] + - [642, 1165.88] - - [176, 1500, 1, 1408] - - [607, 4922.13] + - [669, 4922.13] - - [448, 5888, 1, 1280] - - [689, 7550.21] + - [751, 7550.21] - - [512, 3000, 1, 2048] - - [711, 6562.44] + - [773, 6562.44] - - [5056, 448, 1, 128] - - [661, 3947.97] + - [723, 3947.97] - - [4288, 704, 1, 1280] - - [679, 8243.82] + - [741, 8243.82] - - [3584, 2944, 1, 128] - - [671, 4284.88] + - [733, 4284.88] - - [6784, 256, 1, 1280] - - [679, 7955.21] + - [741, 7955.21] - - [256, 2944, 1, 1280] - - [709, 6691.9] + - [771, 6691.9] - - [2560, 128, 1, 2560] - - [647, 5347.23] + - [709, 5347.23] - - [2368, 5888, 1, 3328] - - [684, 8919.07] + - [746, 8919.07] - - [4, 64, 1, 256] - - [632, 13.1032] + - [694, 13.1032] - - [704, 1024, 1, 3328] - - [709, 6648.12] + - [771, 6648.12] - - [2368, 1856, 1, 1280] - - [695, 8016.51] + - [757, 8016.51] - - [448, 5056, 1, 3328] - - [679, 8231.73] + - [741, 8231.73] - - [128, 448, 1, 128] - - [588, 441.208] + - [650, 441.208] - - [128, 6784, 1, 256] - - [689, 5850.05] + - [751, 5850.05] - - [512, 4, 1, 500000] - - [569, 1027.14] + - [631, 1027.14] - - [3584, 4288, 1, 128] - - [665, 4260.9] + - [727, 4260.9] - - [64, 448, 1, 128] - - [588, 253.554] + - [650, 253.554] - - [1024, 6000, 1, 2816] - - [693, 8886.14] + - [755, 8886.14] - - [5888, 4288, 1, 3328] - - [693, 8968.16] + - [755, 8968.16] - - [2368, 704, 1, 256] - - [709, 4663.24] + - [771, 4663.24] - - [256, 1856, 1, 3328] - - [681, 6480.63] + - [743, 6480.63] - - [1856, 128, 1, 256] - - [679, 3726.66] + - [741, 3726.66] - - [6784, 128, 1, 128] - - [582, 2824.01] + - [644, 2824.01] - - [3584, 1408, 1, 128] - - [665, 3666.78] + - [727, 3666.78] - - [1856, 5056, 1, 1280] - - [676, 8651.36] + - [738, 8651.36] - - [2944, 1024, 1, 1280] - - [687, 8765.21] + - [749, 8765.21] - - [5056, 4, 1, 256] - - [601, 428.688] + - [663, 428.688] - - [3584, 5888, 1, 3328] - - [687, 9347.75] + - [749, 9347.75] - - [2368, 4288, 1, 256] - - [697, 8013.1] + - [759, 8013.1] - - [1024, 2368, 1, 3328] - - [684, 8119.29] + - [746, 8119.29] - - [128, 3584, 1, 128] - - [584, 2584.62] + - [646, 2584.62] - - [704, 1408, 1, 256] - - [689, 6792.27] + - [751, 6792.27] - - [4096, 128, 1, 4096] - - [711, 6624.84] + - [773, 6624.84] - - [1024, 2944, 1, 128] - - [663, 3771.37] + - [725, 3771.37] - - [1024, 3584, 1, 1280] - - [684, 8952.71] + - [746, 8952.71] - - [4288, 5888, 1, 3328] - - [697, 9048.05] + - [759, 9048.05] - - [4288, 4, 1, 3328] - - [602, 615.206] + - [664, 615.206] - - [4608, 16, 1, 1536] - - [606, 2894.94] + - [668, 2894.94] - - [5888, 64, 1, 128] - - [593, 1827.16] + - [655, 1827.16] - - [4, 5888, 1, 128] - - [714, 179.544] + - [776, 179.544] - - [1024, 2944, 1, 3328] - - [685, 8298.77] + - [747, 8298.77] - - [2048, 64, 1, 2048] - - [614, 4963.77] + - [676, 4963.77] - - [6144, 2, 1, 2560] - - [603, 477.88] + - [665, 477.88] - - [256, 6784, 1, 1280] - - [677, 7491.94] + - [739, 7491.94] - - [1856, 3584, 1, 256] - - [689, 7580.6] + - [751, 7580.6] - - [128, 448, 1, 3328] - - [640, 4417.71] + - [702, 4417.71] - - [6784, 1856, 1, 128] - - [668, 4621.74] + - [730, 4621.74] - - [1024, 1500, 1, 2048] - - [689, 6284.5] + - [751, 6284.5] - - [5056, 128, 1, 256] - - [689, 5705.16] + - [751, 5705.16] - - [512, 24000, 1, 2816] - - [676, 8919.85] + - [738, 8919.85] - - [256, 5888, 1, 1280] - - [691, 7978.0] + - [753, 7978.0] - - [4, 128, 1, 1280] - - [632, 94.2609] + - [694, 94.2609] - - [4288, 6784, 1, 3328] - - [697, 9012.58] + - [759, 9012.58] - - [6784, 128, 1, 1280] - - [681, 6807.35] + - [743, 6807.35] - - [64, 1408, 1, 256] - - [610, 2045.19] + - [672, 2045.19] - - [2368, 1408, 1, 128] - - [661, 4340.73] + - [723, 4340.73] - - [1856, 448, 1, 256] - - [710, 3639.99] + - [772, 3639.99] - - [1408, 1024, 1, 128] - - [669, 3417.68] + - [731, 3417.68] - - [128, 64, 1, 128] - - [590, 68.7241] + - [652, 68.7241] - - [6784, 3584, 1, 3328] - - [687, 9425.63] + - [749, 9425.63] - - [1760, 7000, 1, 1760] - - [684, 8780.41] + - [746, 8780.41] - - [1024, 704, 1, 3328] - - [701, 5644.6] + - [763, 5644.6] - - [64, 64, 1, 128] - - [580, 38.2023] + - [642, 38.2023] - - [2368, 5056, 1, 1280] - - [698, 8462.41] + - [760, 8462.41] - - [64, 4, 1, 1280] - - [632, 46.6455] + - [694, 46.6455] - - [1408, 2368, 1, 1280] - - [684, 8235.08] + - [746, 8235.08] - - [128, 1408, 1, 1280] - - [646, 4491.66] + - [708, 4491.66] - - [1024, 1, 1, 512] - - [650, 82.02] + - [712, 82.02] - - [4, 1408, 1, 128] - - [714, 56.42] + - [776, 56.42] - - [704, 4288, 1, 128] - - [668, 3942.96] + - [730, 3942.96] - - [128, 1856, 1, 3328] - - [634, 6111.93] + - [696, 6111.93] - - [2944, 2944, 1, 256] - - [693, 8640.22] + - [755, 8640.22] - - [2944, 4, 1, 1280] - - [627, 554.265] + - [689, 554.265] - - [5888, 4, 1, 256] - - [609, 435.744] + - [671, 435.744] - - [6784, 256, 1, 256] - - [689, 7025.96] + - [751, 7025.96] - - [256, 5056, 1, 3328] - - [689, 8249.57] + - [751, 8249.57] - - [128, 4288, 1, 1280] - - [679, 5561.74] + - [741, 5561.74] - - [5056, 1856, 1, 128] - - [673, 3975.28] + - [735, 3975.28] - - [1024, 3000, 1, 1536] - - [694, 8544.54] + - [756, 8544.54] - - [5056, 1024, 1, 3328] - - [687, 9361.47] + - [749, 9361.47] - - [128, 128, 1, 256] - - [639, 699.151] + - [701, 699.151] - - [1760, 64, 1, 1760] - - [607, 4956.26] + - [669, 4956.26] - - [4288, 3584, 1, 3328] - - [707, 7506.18] + - [769, 7506.18] - - [448, 704, 1, 3328] - - [679, 4697.66] + - [741, 4697.66] - - [448, 448, 1, 128] - - [596, 1249.62] + - [658, 1249.62] - - [1024, 2368, 1, 1280] - - [689, 7756.44] + - [751, 7756.44] - - [1856, 704, 1, 3328] - - [689, 8340.66] + - [751, 8340.66] - - [512, 1500, 1, 2560] - - [691, 6041.39] + - [753, 6041.39] - - [5888, 6784, 1, 3328] - - [687, 9199.38] + - [749, 9199.38] - - [704, 4288, 1, 1280] - - [681, 8342.06] + - [743, 8342.06] - - [128, 50176, 1, 512] - - [727, 7589.48] + - [789, 7589.48] - - [704, 256, 1, 256] - - [679, 2912.81] + - [741, 2912.81] - - [1024, 48000, 1, 2048] - - [684, 8947.42] + - [746, 8947.42] - - [4288, 1024, 1, 128] - - [660, 4291.75] + - [722, 4291.75] - - [3136, 64, 128, 64] - - [742, 8175.16] + - [804, 8175.16] - - [784, 512, 64, 128] - - [740, 8378.44] + - [802, 8378.44] - - [3136, 256, 64, 64] - - [743, 8506.75] + - [805, 8506.75] - - [12544, 1024, 1, 256] - - [736, 8928.03] + - [798, 8928.03] - - [784, 128, 128, 512] - - [741, 8190.63] + - [803, 8190.63] - - [784, 512, 256, 128] - - [739, 8637.24] + - [801, 8637.24] - - [3136, 64, 64, 256] - - [738, 8783.03] + - [800, 8783.03] - - [3136, 512, 1, 2048] - - [735, 7298.42] + - [797, 7298.42] - - [12544, 256, 1, 1024] - - [747, 7667.35] + - [809, 7667.35] - - [3136, 2048, 1, 512] - - [746, 8447.32] + - [808, 8447.32] - - [3136, 256, 256, 64] - - [739, 8663.18] + - [801, 8663.18] - - [3136, 64, 128, 256] - - [737, 8943.56] + - [799, 8943.56] - - [784, 128, 64, 512] - - [745, 8006.37] + - [807, 8006.37] - - [3136, 64, 256, 64] - - [742, 8267.22] + - [804, 8267.22] - - [784, 512, 128, 128] - - [739, 8564.35] + - [801, 8564.35] - - [3136, 64, 64, 64] - - [742, 8009.45] + - [804, 8009.45] - - [784, 128, 256, 512] - - [743, 8377.16] + - [805, 8377.16] - - [3136, 64, 256, 256] - - [744, 9033.98] + - [806, 9033.98] - - [3136, 256, 128, 64] - - [739, 8624.56] + - [801, 8624.56] - - [1024, 256, 1, 1024] - - [765, 6331.13] + - [827, 6331.13] - - [1024, 512, 1, 2048] - - [764, 8100.14] + - [826, 8100.14] - - [512, 200, 1, 512] - - [773, 2861.93] + - [835, 2861.93] - - [4096, 256, 1, 2048] - - [756, 8812.82] + - [818, 8812.82] - - [4096, 512, 1, 1024] - - [766, 9068.87] + - [828, 9068.87] - - [1024, 200, 1, 1024] - - [765, 5110.12] + - [827, 5110.12] - - [1024, 512, 1, 1024] - - [758, 7785.35] + - [820, 7785.35] - - [2048, 256, 1, 4096] - - [768, 8438.81] + - [830, 8438.81] - - [2048, 768, 1, 512] - - [750, 8618.53] + - [812, 8618.53] - - [512, 256, 1, 1024] - - [770, 4835.03] + - [832, 4835.03] - - [512, 768, 1, 2048] - - [767, 6909.04] + - [829, 6909.04] - - [2048, 256, 1, 1024] - - [763, 7941.98] + - [825, 7941.98] - - [1024, 256, 1, 2048] - - [760, 6997.9] + - [822, 6997.9] - - [2048, 200, 1, 512] - - [763, 5649.76] + - [825, 5649.76] - - [4096, 200, 1, 1024] - - [761, 6678.93] + - [823, 6678.93] - - [2048, 200, 1, 4096] - - [769, 6706.69] + - [831, 6706.69] - - [2048, 512, 1, 1024] - - [766, 8549.0] + - [828, 8549.0] - - [1024, 1024, 1, 512] - - [761, 8046.73] + - [823, 8046.73] - - [1024, 200, 1, 4096] - - [760, 5884.36] + - [822, 5884.36] - - [2048, 512, 1, 4096] - - [771, 8995.94] + - [833, 8995.94] - - [4096, 512, 1, 2048] - - [766, 9298.18] - - - [512, 256, 1, 2048] - - [759, 5186.26] + - [828, 9298.18] - - [4096, 1024, 1, 2048] - - [748, 9790.77] + - [810, 9790.77] - - [2048, 1024, 1, 2048] - - [749, 9278.9] + - [811, 9278.9] - - [1024, 200, 1, 512] - - [765, 4535.46] + - [827, 4535.46] - - [1024, 1024, 1, 4096] - - [756, 8967.39] + - [818, 8967.39] - - [2048, 1024, 1, 4096] - - [751, 9500.56] + - [813, 9500.56] - - [4096, 200, 1, 2048] - - [757, 7082.68] + - [819, 7082.68] - - [2048, 200, 1, 1024] - - [763, 6212.04] + - [825, 6212.04] - - [1024, 768, 1, 512] - - [764, 7401.81] + - [826, 7401.81] - - [2048, 512, 1, 512] - - [761, 8124.66] + - [823, 8124.66] - - [2048, 200, 1, 2048] - - [763, 6561.9] + - [825, 6561.9] - - [2048, 256, 1, 2048] - - [764, 8224.23] + - [826, 8224.23] - - [512, 768, 1, 512] - - [762, 6469.46] + - [824, 6469.46] - - [512, 200, 1, 1024] - - [765, 3755.74] + - [827, 3755.74] - - [4096, 1024, 1, 1024] - - [748, 9605.95] + - [810, 9605.95] - - [4096, 256, 1, 4096] - - [771, 8961.39] + - [833, 8961.39] - - [1024, 512, 1, 512] - - [764, 7109.09] + - [826, 7109.09] - - [512, 256, 1, 512] - - [772, 4033.08] + - [834, 4033.08] - - [1024, 256, 1, 4096] - - [760, 7326.4] + - [822, 7326.4] - - [4096, 512, 1, 4096] - - [752, 9472.07] + - [814, 9472.07] - - [1024, 200, 1, 2048] - - [753, 5530.56] + - [815, 5530.56] - - [2048, 1024, 1, 512] - - [754, 8995.93] + - [816, 8995.93] - - [1024, 1024, 1, 2048] - - [761, 8830.21] + - [823, 8830.21] - - [4096, 256, 1, 1024] - - [761, 8581.8] + - [823, 8581.8] - - [512, 768, 1, 1024] - - [762, 6876.01] + - [824, 6876.01] - - [1024, 512, 1, 4096] - - [758, 8484.15] + - [820, 8484.15] - - [1024, 256, 1, 512] - - [755, 5668.08] + - [817, 5668.08] - - [4096, 200, 1, 4096] - - [768, 7018.69] + - [830, 7018.69] - - [2048, 256, 1, 512] - - [768, 7079.09] + - [830, 7079.09] - - [512, 200, 1, 2048] - - [773, 4283.5] + - [835, 4283.5] - - [1024, 1024, 1, 1024] - - [756, 8565.37] + - [818, 8565.37] - - [2048, 512, 1, 2048] - - [756, 8850.59] + - [818, 8850.59] - - [4096, 1024, 1, 4096] - - [749, 9843.28] + - [811, 9843.28] - - [2048, 1024, 1, 1024] - - [754, 9234.21] + - [816, 9234.21] - - [4096, 384, 1, 2048] - - [796, 8892.62] + - [858, 8892.62] - - [4096, 192, 1, 2048] - - [790, 8024.28] + - [852, 8024.28] - - [289, 160, 64, 768] - - [792, 6783.73] + - [854, 6783.73] - - [1225, 192, 64, 384] - - [779, 9373.93] + - [841, 9373.93] - - [5329, 64, 64, 160] - - [783, 9186.79] + - [845, 9186.79] - - [1225, 64, 64, 288] - - [774, 8492.51] + - [836, 8492.51] - - [1225, 64, 64, 384] - - [778, 8735.86] + - [840, 8735.86] - - [289, 128, 64, 1024] - - [793, 7000.3] + - [855, 7000.3] - - [4096, 320, 1, 1280] - - [798, 8302.36] + - [860, 8302.36] - - [4096, 384, 1, 1536] - - [780, 9052.55] + - [842, 9052.55] - - [4096, 192, 1, 1280] - - [795, 7561.95] + - [857, 7561.95] - - [289, 192, 64, 768] - - [791, 7882.6] + - [853, 7882.6] - - [1225, 48, 64, 256] - - [782, 6620.35] + - [844, 6620.35] - - [289, 192, 64, 1024] - - [789, 7347.09] + - [851, 7347.09] - - [1225, 64, 64, 192] - - [775, 8098.45] + - [837, 8098.45] - - [1225, 96, 64, 384] - - [776, 8303.18] + - [838, 8303.18] - - [1225, 48, 64, 288] - - [784, 6746.87] + - [846, 6746.87] - - [4096, 320, 1, 2048] - - [785, 8384.52] + - [847, 8384.52] - - [4096, 256, 1, 1536] - - [797, 8734.44] + - [859, 8734.44] - - [1225, 48, 64, 192] - - [784, 6516.46] + - [846, 6516.46] - - [4096, 384, 1, 1280] - - [794, 9023.34] + - [856, 9023.34] - - [1225, 64, 64, 256] - - [781, 8319.44] + - [843, 8319.44] - - [4096, 448, 1, 1280] - - [785, 8343.42] + - [847, 8343.42] - - [289, 128, 64, 768] - - [787, 7668.08] + - [849, 7668.08] - - [289, 256, 64, 1024] - - [788, 7535.56] + - [850, 7535.56] - - [4096, 448, 1, 2048] - - [785, 8572.41] + - [847, 8572.41] - - [5329, 80, 64, 64] - - [784, 6492.54] + - [846, 6492.54] - - [1225, 32, 64, 192] - - [777, 6278.64] + - [839, 6278.64] - - [289, 384, 64, 1024] - - [786, 7767.67] + - [848, 7767.67] - - [1024, 3594, 1, 4096] - - [805, 8661.52] + - [867, 8661.52] - - [4096, 3103, 1, 1024] - - [815, 9652.23] + - [877, 9652.23] - - [4096, 3136, 1, 1024] - - [799, 9723.15] + - [861, 9723.15] - - [1024, 3141, 1, 4096] - - [817, 8612.12] + - [879, 8612.12] - - [64, 147, 432, 148] - - [832, 6372.03] + - [894, 6372.03] - - [4096, 3559, 1, 1024] - - [804, 9906.35] + - [866, 9906.35] - - [4096, 3368, 1, 1024] - - [799, 9721.01] + - [861, 9721.01] - - [1024, 3335, 1, 4096] - - [823, 8990.29] + - [885, 8990.29] - - [1024, 3510, 1, 4096] - - [823, 9440.68] + - [885, 9440.68] - - [4096, 3209, 1, 1024] - - [804, 9632.76] + - [866, 9632.76] - - [4096, 3322, 1, 1024] - - [803, 9939.52] + - [865, 9939.52] - - [1024, 3400, 1, 4096] - - [822, 9156.09] + - [884, 9156.09] - - [1024, 3995, 1, 4096] - - [805, 9610.25] + - [867, 9610.25] - - [1024, 3503, 1, 4096] - - [823, 9446.57] + - [885, 9446.57] - - [4096, 3594, 1, 1024] - - [814, 9691.96] + - [876, 9691.96] - - [4096, 3473, 1, 1024] - - [803, 9698.9] + - [865, 9698.9] - - [4096, 3522, 1, 1024] - - [804, 9816.92] + - [866, 9816.92] - - [1024, 3103, 1, 4096] - - [801, 8491.05] + - [863, 8491.05] - - [1024, 3214, 1, 4096] - - [822, 8667.67] + - [884, 8667.67] - - [4096, 3449, 1, 1024] - - [814, 9795.71] + - [876, 9795.71] - - [1024, 3136, 1, 4096] - - [823, 8500.61] + - [885, 8500.61] - - [1024, 3955, 1, 33708] - - [803, 9634.94] + - [865, 9634.94] - - [1024, 3780, 1, 4096] - - [806, 9088.88] + - [868, 9088.88] - - [1024, 3906, 1, 33708] - - [804, 9515.46] + - [866, 9515.46] - - [1024, 3386, 1, 4096] - - [823, 9116.05] + - [885, 9116.05] - - [4096, 3396, 1, 1024] - - [814, 9665.6] + - [876, 9665.6] - - [1024, 3183, 1, 4096] - - [801, 8662.94] + - [863, 8662.94] - - [1024, 3098, 1, 4096] - - [817, 8490.22] + - [879, 8490.22] - - [1024, 3548, 1, 4096] - - [823, 9555.63] + - [885, 9555.63] - - [1024, 3224, 1, 4096] - - [816, 8760.88] + - [878, 8760.88] - - [4096, 3469, 1, 1024] - - [803, 9687.21] + - [865, 9687.21] - - [1024, 3582, 1, 4096] - - [820, 9691.0] + - [882, 9691.0] - - [1024, 2977, 1, 4096] - - [805, 9379.38] + - [867, 9379.38] - - [1024, 3939, 1, 1024] - - [802, 9172.11] + - [864, 9172.11] - - [64, 123, 528, 123] - - [850, 6346.17] + - [912, 6346.17] - - [64, 12, 5040, 12] - - [827, 1536.1] + - [889, 1536.1] - - [4096, 3176, 1, 1024] - - [815, 9712.2] + - [877, 9712.2] - - [1024, 3559, 1, 4096] - - [819, 9579.84] + - [881, 9579.84] - - [1024, 3478, 1, 4096] - - [823, 9373.85] + - [885, 9373.85] - - [4096, 3343, 1, 1024] - - [799, 9638.77] + - [861, 9638.77] - - [4096, 3440, 1, 1024] - - [799, 9853.96] + - [861, 9853.96] - - [1024, 3996, 1, 33708] - - [803, 9733.55] + - [865, 9733.55] - - [1024, 4012, 1, 4096] - - [804, 9636.99] + - [866, 9636.99] - - [1024, 3322, 1, 4096] - - [823, 8945.12] + - [885, 8945.12] - - [1024, 3990, 1, 33708] - - [803, 9720.31] + - [865, 9720.31] - - [1024, 3314, 1, 4096] - - [823, 8944.72] + - [885, 8944.72] - - [4096, 3513, 1, 1024] - - [803, 9794.95] + - [865, 9794.95] - - [1024, 3562, 1, 4096] - - [823, 9597.28] + - [885, 9597.28] - - [1024, 3443, 1, 4096] - - [823, 9279.52] + - [885, 9279.52] - - [1024, 3554, 1, 4096] - - [820, 9552.16] + - [882, 9552.16] - - [1024, 3063, 1, 4096] - - [805, 9622.58] + - [867, 9622.58] - - [64, 111, 576, 112] - - [850, 6274.65] + - [912, 6274.65] - - [4096, 3460, 1, 1024] - - [803, 9665.69] + - [865, 9665.69] - - [1024, 3209, 1, 4096] - - [802, 8708.39] + - [864, 8708.39] - - [1024, 3147, 1, 4096] - - [823, 8492.23] + - [885, 8492.23] - - [4096, 3387, 1, 1024] - - [800, 9761.34] + - [862, 9761.34] - - [4096, 3436, 1, 1024] - - [799, 9815.15] + - [861, 9815.15] - - [1024, 3341, 1, 4096] - - [822, 9005.07] + - [884, 9005.07] - - [1024, 3516, 1, 4096] - - [822, 9471.39] + - [884, 9471.39] - - [4096, 3277, 1, 1024] - - [803, 9807.12] + - [865, 9807.12] - - [1024, 3454, 1, 4096] - - [823, 9301.03] + - [885, 9301.03] - - [1024, 3969, 1, 4096] - - [803, 9539.82] + - [865, 9539.82] - - [1024, 3999, 1, 4096] - - [804, 9607.52] + - [866, 9607.52] - - [1024, 4032, 1, 4096] - - [805, 9693.47] + - [867, 9693.47] - - [4096, 3541, 1, 1024] - - [804, 9866.73] + - [866, 9866.73] - - [4096, 3334, 1, 1024] - - [815, 9614.41] + - [877, 9614.41] - - [1024, 3365, 1, 4096] - - [823, 9058.58] + - [885, 9058.58] - - [1024, 3527, 1, 4096] - - [823, 9510.31] + - [885, 9510.31] - - [1024, 3190, 1, 4096] - - [822, 8627.8] + - [884, 8627.8] - - [4096, 3906, 1, 1024] - - [800, 9817.78] + - [862, 9817.78] - - [1024, 3593, 1, 4096] - - [805, 8663.09] + - [867, 8663.09] - - [1024, 3336, 1, 4096] - - [823, 8991.13] + - [885, 8991.13] - - [4096, 3504, 1, 1024] - - [803, 9769.86] + - [865, 9769.86] - - [4096, 3977, 1, 1024] - - [804, 9742.62] + - [866, 9742.62] - - [1024, 3906, 1, 4096] - - [804, 9386.25] + - [866, 9386.25] - - [4096, 3415, 1, 1024] - - [814, 9802.7] + - [876, 9802.7] - - [1024, 3295, 1, 4096] - - [822, 8879.26] + - [884, 8879.26] - - [4096, 3321, 1, 1024] - - [804, 9931.43] + - [866, 9931.43] - - [1024, 3072, 1, 4096] - - [805, 9671.71] + - [867, 9671.71] - - [1024, 3408, 1, 4096] - - [822, 9182.83] + - [884, 9182.83] - - [1024, 3522, 1, 4096] - - [823, 9484.63] + - [885, 9484.63] - - [4096, 3751, 1, 1024] - - [804, 9778.86] + - [866, 9778.86] - - [4096, 3378, 1, 1024] - - [814, 9692.77] + - [876, 9692.77] - - [64, 77, 816, 77] - - [856, 4850.29] + - [918, 4850.29] - - [1024, 3925, 1, 33708] - - [803, 9560.88] + - [865, 9560.88] - - [1024, 3990, 1, 1024] - - [805, 9272.75] + - [867, 9272.75] - - [1024, 3290, 1, 4096] - - [816, 8905.61] + - [878, 8905.61] - - [4096, 3500, 1, 1024] - - [804, 9761.82] + - [866, 9761.82] - - [4096, 3565, 1, 1024] - - [803, 9919.37] + - [865, 9919.37] - - [1024, 3484, 1, 4096] - - [822, 9376.52] + - [884, 9376.52] - - [4096, 3395, 1, 1024] - - [815, 9788.16] + - [877, 9788.16] - - [64, 92, 688, 92] - - [842, 5606.1] + - [904, 5606.1] - - [1024, 3681, 1, 1024] - - [807, 8690.23] + - [869, 8690.23] - - [64, 159, 400, 159] - - [834, 6518.97] + - [896, 6518.97] - - [1024, 3584, 1, 1024] - - [822, 9365.37] + - [884, 9365.37] - - [4096, 3093, 1, 1024] - - [814, 9623.41] + - [876, 9623.41] - - [1024, 4050, 1, 1024] - - [806, 9354.14] + - [868, 9354.14] - - [1024, 3301, 1, 4096] - - [823, 8889.04] + - [885, 8889.04] - - [1024, 3581, 1, 4096] - - [822, 9673.82] + - [884, 9673.82] - - [4096, 3374, 1, 1024] - - [815, 9707.33] + - [877, 9707.33] - - [1024, 3449, 1, 4096] - - [823, 9270.9] + - [885, 9270.9] - - [4096, 3215, 1, 1024] - - [804, 9645.25] + - [866, 9645.25] - - [4096, 3312, 1, 1024] - - [804, 9888.72] + - [866, 9888.72] - - [4096, 3479, 1, 1024] - - [804, 9698.61] + - [866, 9698.61] - - [4096, 3544, 1, 1024] - - [804, 9875.09] + - [866, 9875.09] - - [1024, 3263, 1, 4096] - - [823, 8787.61] + - [885, 8787.61] - - [4096, 3455, 1, 1024] - - [814, 9845.29] + - [876, 9845.29] - - [1024, 3379, 1, 4096] - - [820, 9100.01] + - [882, 9100.01] - - [1024, 3490, 1, 4096] - - [823, 9397.49] + - [885, 9397.49] - - [1024, 3368, 1, 4096] - - [823, 9079.25] + - [885, 9079.25] - - [4096, 3186, 1, 1024] - - [799, 9750.17] + - [861, 9750.17] - - [1024, 3428, 1, 4096] - - [823, 9232.92] + - [885, 9232.92] - - [64, 85, 752, 84] - - [838, 5342.67] + - [900, 5342.67] - - [4096, 3561, 1, 1024] - - [804, 9914.02] + - [866, 9914.02] - - [4096, 3418, 1, 1024] - - [814, 9765.86] + - [876, 9765.86] - - [1024, 3064, 1, 4096] - - [805, 9621.68] + - [867, 9621.68] - - [4096, 3259, 1, 1024] - - [804, 9765.52] + - [866, 9765.52] - - [4096, 3308, 1, 1024] - - [803, 9900.46] + - [865, 9900.46] - - [1024, 3533, 1, 4096] - - [823, 9520.12] + - [885, 9520.12] - - [1024, 3344, 1, 4096] - - [823, 9014.55] + - [885, 9014.55] - - [1024, 4030, 1, 1024] - - [805, 9354.1] + - [867, 9354.1] - - [4096, 3459, 1, 1024] - - [804, 9656.2] + - [866, 9656.2] - - [1024, 3572, 1, 4096] - - [820, 9640.07] + - [882, 9640.07] - - [1024, 3925, 1, 1024] - - [816, 9173.74] + - [878, 9173.74] - - [4096, 3435, 1, 1024] - - [799, 9778.2] + - [861, 9778.2] - - [1024, 3956, 1, 4096] - - [806, 9498.56] + - [868, 9498.56] - - [1024, 3463, 1, 4096] - - [823, 9332.46] + - [885, 9332.46] - - [4096, 3182, 1, 1024] - - [814, 9826.84] + - [876, 9826.84] - - [4096, 3976, 1, 1024] - - [814, 9741.99] + - [876, 9741.99] - - [1024, 3417, 1, 4096] - - [823, 9208.97] + - [885, 9208.97] - - [1024, 3528, 1, 4096] - - [823, 9509.09] + - [885, 9509.09] - - [4096, 3446, 1, 1024] - - [814, 9816.97] + - [876, 9816.97] - - [64, 122, 528, 123] - - [850, 6325.98] + - [912, 6325.98] - - [1024, 3543, 1, 4096] - - [823, 9538.73] + - [885, 9538.73] - - [4096, 3287, 1, 1024] - - [803, 9846.04] + - [865, 9846.04] - - [1024, 3499, 1, 4096] - - [823, 9428.51] + - [885, 9428.51] - - [1024, 3231, 1, 4096] - - [816, 8769.91] + - [878, 8769.91] - - [64, 17, 3632, 17] - - [838, 1934.94] + - [900, 1934.94] - - [4096, 3519, 1, 1024] - - [803, 9804.38] + - [865, 9804.38] - - [4096, 3552, 1, 1024] - - [803, 9892.65] + - [865, 9892.65] - - [1024, 3458, 1, 4096] - - [823, 9312.28] + - [885, 9312.28] - - [64, 93, 688, 92] - - [842, 5660.22] + - [904, 5660.22] - - [1024, 3374, 1, 4096] - - [817, 9110.41] + - [879, 9110.41] - - [1024, 3396, 1, 4096] - - [823, 9145.79] + - [885, 9145.79] - - [1024, 2967, 1, 4096] - - [805, 9364.76] + - [867, 9364.76] - - [64, 19, 3264, 19] - - [842, 2142.47] + - [904, 2142.47] - - [4096, 3482, 1, 1024] - - [803, 9714.2] + - [865, 9714.2] - - [64, 32, 1984, 32] - - [853, 3619.91] + - [915, 3619.91] - - [64, 102, 624, 99] - - [844, 5515.33] + - [906, 5515.33] - - [1024, 3226, 1, 4096] - - [802, 8790.47] + - [864, 8790.47] - - [4096, 3377, 1, 1024] - - [800, 9684.08] + - [862, 9684.08] - - [4096, 3426, 1, 1024] - - [815, 9869.94] + - [877, 9869.94] - - [4096, 2935, 1, 1024] - - [815, 9762.11] + - [877, 9762.11] - - [64, 133, 480, 133] - - [854, 5891.32] + - [916, 5891.32] - - [1024, 3439, 1, 4096] - - [823, 9253.99] + - [885, 9253.99] - - [4096, 3267, 1, 1024] - - [803, 9783.9] + - [865, 9783.9] - - [4096, 3499, 1, 1024] - - [804, 9761.11] + - [866, 9761.11] - - [4096, 3356, 1, 1024] - - [815, 9679.44] + - [877, 9679.44] - - [64, 232, 272, 232] - - [858, 7181.03] + - [920, 7181.03] - - [64, 162, 400, 159] - - [818, 6444.63] + - [880, 6444.63] - - [4096, 3939, 1, 1024] - - [814, 9878.0] + - [876, 9878.0] - - [1024, 3526, 1, 4096] - - [823, 9508.1] + - [885, 9508.1] - - [1024, 3859, 1, 33708] - - [804, 9402.13] + - [866, 9402.13] - - [1024, 3385, 1, 4096] - - [822, 9107.28] + - [884, 9107.28] - - [1024, 3496, 1, 4096] - - [823, 9418.0] + - [885, 9418.0] - - [4096, 3141, 1, 1024] - - [815, 9682.54] + - [877, 9682.54] - - [4096, 3510, 1, 1024] - - [803, 9786.59] + - [865, 9786.59] - - [1024, 3434, 1, 4096] - - [823, 9246.7] + - [885, 9246.7] - - [4096, 3969, 1, 1024] - - [803, 9714.85] + - [865, 9714.85] - - [1024, 3121, 1, 4096] - - [801, 8464.32] + - [863, 8464.32] - - [1024, 3232, 1, 4096] - - [823, 8711.73] + - [885, 8711.73] - - [1024, 4030, 1, 33708] - - [804, 9816.31] + - [866, 9816.31] - - [1024, 3780, 1, 33708] - - [812, 9315.54] + - [874, 9315.54] - - [1024, 3969, 1, 1024] - - [801, 9248.54] + - [863, 9248.54] - - [4096, 3527, 1, 1024] - - [803, 9832.94] + - [865, 9832.94] - - [4096, 3336, 1, 1024] - - [800, 9623.35] + - [862, 9623.35] - - [4096, 3290, 1, 1024] - - [803, 9852.21] + - [865, 9852.21] - - [64, 9, 6544, 9] - - [843, 1068.24] + - [905, 1068.24] - - [1024, 3469, 1, 4096] - - [823, 9350.55] + - [885, 9350.55] - - [4096, 3490, 1, 1024] - - [803, 9737.56] + - [865, 9737.56] - - [4096, 3064, 1, 1024] - - [803, 9890.02] + - [865, 9890.02] - - [4096, 3582, 1, 1024] - - [804, 9961.38] + - [866, 9961.38] - - [1024, 3956, 1, 1024] - - [801, 9294.25] + - [863, 9294.25] - - [4096, 3417, 1, 1024] - - [799, 9811.66] + - [861, 9811.66] - - [1024, 2736, 1, 4096] - - [805, 8636.7] + - [867, 8636.7] - - [64, 78, 816, 78] - - [842, 4946.1] + - [904, 4946.1] - - [1024, 3205, 1, 4096] - - [817, 8657.21] + - [879, 8657.21] - - [1024, 3143, 1, 4096] - - [817, 8567.87] + - [879, 8567.87] - - [1024, 4020, 1, 4096] - - [805, 9664.62] + - [867, 9664.62] - - [1024, 3318, 1, 4096] - - [802, 8967.05] + - [864, 8967.05] - - [4096, 3364, 1, 1024] - - [815, 9697.18] + - [877, 9697.18] - - [1024, 3353, 1, 4096] - - [823, 9034.17] + - [885, 9034.17] - - [1024, 3464, 1, 4096] - - [823, 9326.05] + - [885, 9326.05] - - [4096, 3205, 1, 1024] - - [803, 9619.1] + - [865, 9619.1] - - [4096, 3318, 1, 1024] - - [804, 9932.66] + - [866, 9932.66] - - [1024, 3402, 1, 4096] - - [822, 9153.49] + - [884, 9153.49] - - [4096, 3181, 1, 1024] - - [814, 9789.15] + - [876, 9789.15] - - [4096, 3550, 1, 1024] - - [804, 9888.13] + - [866, 9888.13] - - [4096, 3445, 1, 1024] - - [814, 9752.65] + - [876, 9752.65] - - [1024, 3138, 1, 4096] - - [800, 8484.1] + - [862, 8484.1] - - [64, 99, 624, 99] - - [850, 5323.99] + - [912, 5323.99] - - [4096, 3079, 1, 1024] - - [800, 9562.26] + - [862, 9562.26] - - [4096, 3144, 1, 1024] - - [814, 9686.66] + - [876, 9686.66] - - [4096, 3860, 1, 1024] - - [815, 9733.42] + - [877, 9733.42] - - [1024, 3515, 1, 4096] - - [823, 9478.44] + - [885, 9478.44] - - [4096, 3408, 1, 1024] - - [800, 9764.96] + - [862, 9764.96] - - [64, 101, 624, 102] - - [850, 5482.79] + - [912, 5482.79] - - [1024, 3181, 1, 4096] - - [802, 8593.26] + - [864, 8593.26] - - [4096, 3298, 1, 1024] - - [804, 9867.72] + - [866, 9867.72] - - [4096, 3585, 1, 1024] - - [814, 9633.01] + - [876, 9633.01] - - [1024, 3550, 1, 4096] - - [823, 9564.46] + - [885, 9564.46] - - [1024, 4020, 1, 1024] - - [806, 9339.15] + - [868, 9339.15] - - [4096, 3481, 1, 1024] - - [804, 9714.0] + - [866, 9714.0] - - [4096, 3530, 1, 1024] - - [804, 9833.99] + - [866, 9833.99] - - [4096, 3425, 1, 1024] - - [800, 9675.66] + - [862, 9675.66] - - [4096, 4026, 1, 1024] - - [804, 9849.77] + - [866, 9849.77] - - [1024, 3860, 1, 1024] - - [817, 9073.59] + - [879, 9073.59] - - [4096, 3975, 1, 1024] - - [804, 9737.72] + - [866, 9737.72] - - [1024, 3286, 1, 4096] - - [801, 8884.24] + - [863, 8884.24] - - [1024, 3176, 1, 4096] - - [801, 8597.48] + - [863, 8597.48] - - [1024, 3894, 1, 4096] - - [805, 9359.13] + - [867, 9359.13] - - [4096, 3355, 1, 1024] - - [814, 9693.09] + - [876, 9693.09] - - [4096, 3404, 1, 1024] - - [814, 9786.12] + - [876, 9786.12] - - [1024, 3501, 1, 4096] - - [822, 9426.14] + - [884, 9426.14] - - [4096, 3245, 1, 1024] - - [804, 9723.57] + - [866, 9723.57] - - [1024, 3431, 1, 4096] - - [820, 9244.32] + - [882, 9244.32] - - [1024, 4000, 1, 1024] - - [816, 9344.03] + - [878, 9344.03] - - [4096, 3509, 1, 1024] - - [803, 9781.72] + - [865, 9781.72] - - [4096, 3558, 1, 1024] - - [804, 9905.15] + - [866, 9905.15] - - [1024, 3535, 1, 4096] - - [822, 9519.15] + - [884, 9519.15] - - [1024, 3414, 1, 4096] - - [820, 9198.05] + - [882, 9198.05] - - [1024, 3445, 1, 4096] - - [823, 9279.66] + - [885, 9279.66] - - [1024, 3436, 1, 4096] - - [823, 9259.7] + - [885, 9259.7] - - [4096, 3472, 1, 1024] - - [804, 9685.27] + - [866, 9685.27] - - [1024, 3211, 1, 4096] - - [802, 8708.41] + - [864, 8708.41] - - [64, 7, 8192, 7] - - [839, 802.916] + - [901, 802.916] - - [4096, 3383, 1, 1024] - - [814, 9734.82] + - [876, 9734.82] - - [4096, 3448, 1, 1024] - - [815, 9828.54] + - [877, 9828.54] - - [1024, 3343, 1, 4096] - - [816, 9010.46] + - [878, 9010.46] - - [1024, 3518, 1, 4096] - - [823, 9468.02] + - [885, 9468.02] - - [4096, 3289, 1, 1024] - - [804, 9844.16] + - [866, 9844.16] - - [1024, 3440, 1, 4096] - - [819, 9269.52] + - [881, 9269.52] - - [1024, 4032, 1, 33708] - - [803, 9822.41] + - [865, 9822.41] - - [4096, 3489, 1, 1024] - - [803, 9742.03] + - [865, 9742.03] - - [4096, 3346, 1, 1024] - - [800, 9616.74] + - [862, 9616.74] - - [1024, 3534, 1, 4096] - - [822, 9524.29] + - [884, 9524.29] - - [1024, 3079, 1, 4096] - - [817, 8397.77] + - [879, 8397.77] - - [1024, 3955, 1, 4096] - - [804, 9492.25] + - [866, 9492.25] - - [4096, 3236, 1, 1024] - - [804, 9706.03] + - [866, 9706.03] - - [1024, 3545, 1, 4096] - - [822, 9551.97] + - [884, 9551.97] - - [1024, 3144, 1, 4096] - - [816, 8556.8] + - [878, 8556.8] - - [4096, 3780, 1, 1024] - - [803, 9847.6] + - [865, 9847.6] - - [4096, 3163, 1, 1024] - - [814, 9717.79] + - [876, 9717.79] - - [4096, 3468, 1, 1024] - - [804, 9686.49] + - [866, 9686.49] - - [1024, 3539, 1, 4096] - - [823, 9526.99] + - [885, 9526.99] - - [1024, 3541, 1, 4096] - - [823, 9532.86] + - [885, 9532.86] - - [4096, 3363, 1, 1024] - - [799, 9699.1] + - [861, 9699.1] - - [1024, 3475, 1, 4096] - - [823, 9357.1] + - [885, 9357.1] - - [4096, 3110, 1, 1024] - - [815, 9659.68] + - [877, 9659.68] - - [1024, 3509, 1, 4096] - - [822, 9450.59] + - [884, 9450.59] - - [1024, 3413, 1, 4096] - - [823, 9185.91] + - [885, 9185.91] - - [1024, 3975, 1, 1024] - - [801, 9315.52] + - [863, 9315.52] - - [4096, 3549, 1, 1024] - - [804, 9884.82] + - [866, 9884.82] - - [4096, 3342, 1, 1024] - - [814, 9644.37] + - [876, 9644.37] - - [1024, 2985, 1, 4096] - - [804, 9392.17] + - [866, 9392.17] - - [1024, 3876, 1, 33708] - - [803, 9442.32] + - [865, 9442.32] - - [4096, 3280, 1, 1024] - - [803, 9820.02] + - [865, 9820.02] - - [4096, 3191, 1, 1024] - - [815, 9862.18] + - [877, 9862.18] - - [4096, 3512, 1, 1024] - - [804, 9793.21] + - [866, 9793.21] - - [1024, 3560, 1, 4096] - - [820, 9555.55] + - [882, 9555.55] - - [4096, 2499, 1, 1024] - - [804, 9669.45] + - [866, 9669.45] - - [1024, 3248, 1, 4096] - - [801, 8811.94] + - [863, 8811.94] - - [4096, 3423, 1, 1024] - - [815, 9729.77] + - [877, 9729.77] - - [64, 111, 576, 111] - - [850, 5982.73] + - [912, 5982.73] - - [4096, 3297, 1, 1024] - - [803, 9865.29] + - [865, 9865.29] - - [4096, 3154, 1, 1024] - - [815, 9613.52] + - [877, 9613.52] - - [1024, 3303, 1, 4096] - - [802, 8951.89] + - [864, 8951.89] - - [1024, 3222, 1, 4096] - - [822, 8682.99] + - [884, 8682.99] - - [1024, 3978, 1, 1024] - - [806, 9235.03] + - [868, 9235.03] - - [4096, 3529, 1, 1024] - - [804, 9831.72] + - [866, 9831.72] - - [4096, 3386, 1, 1024] - - [814, 9755.77] + - [876, 9755.77] - - [64, 134, 480, 134] - - [829, 5990.63] + - [891, 5990.63] - - [1024, 3451, 1, 4096] - - [820, 9277.71] + - [882, 9277.71] - - [4096, 3562, 1, 1024] - - [804, 9908.92] + - [866, 9908.92] - - [4096, 3276, 1, 1024] - - [803, 9818.14] + - [865, 9818.14] - - [64, 135, 480, 132] - - [858, 6071.87] + - [920, 6071.87] - - [1024, 3894, 1, 33708] - - [803, 9487.89] + - [865, 9487.89] - - [64, 134, 480, 132] - - [857, 6091.75] + - [919, 6091.75] - - [4096, 3540, 1, 1024] - - [804, 9862.89] + - [866, 9862.89] - - [1024, 3416, 1, 4096] - - [822, 9206.27] + - [884, 9206.27] - - [1024, 4005, 1, 33708] - - [803, 9757.29] + - [865, 9757.29] - - [1024, 3942, 1, 4096] - - [806, 9455.85] + - [868, 9455.85] - - [4096, 3403, 1, 1024] - - [814, 9739.46] + - [876, 9739.46] - - [4096, 3381, 1, 1024] - - [815, 9760.14] + - [877, 9760.14] - - [1024, 3492, 1, 4096] - - [819, 9391.79] + - [881, 9391.79] - - [4096, 3101, 1, 1024] - - [815, 9626.02] + - [877, 9626.02] - - [1024, 3430, 1, 4096] - - [823, 9232.14] + - [885, 9232.14] - - [1024, 3977, 1, 4096] - - [806, 9563.0] + - [868, 9563.0] - - [1024, 3640, 1, 4096] - - [805, 8761.5] + - [867, 8761.5] - - [4096, 3557, 1, 1024] - - [804, 9905.52] + - [866, 9905.52] - - [4096, 3414, 1, 1024] - - [800, 9755.49] + - [862, 9755.49] - - [1024, 3391, 1, 4096] - - [823, 9142.66] + - [885, 9142.66] - - [64, 134, 480, 135] - - [832, 5922.15] + - [894, 5922.15] - - [64, 16, 3840, 16] - - [848, 2080.61] + - [910, 2080.61] - - [1024, 3356, 1, 4096] - - [823, 9051.09] + - [885, 9051.09] - - [4096, 3320, 1, 1024] - - [804, 9929.57] + - [866, 9929.57] - - [4096, 2765, 1, 1024] - - [804, 9750.28] + - [866, 9750.28] - - [64, 162, 400, 162] - - [821, 6515.29] + - [883, 6515.29] - - [1024, 3411, 1, 4096] - - [823, 9185.72] + - [885, 9185.72] - - [1024, 3978, 1, 4096] - - [803, 9562.77] + - [865, 9562.77] - - [4096, 3487, 1, 1024] - - [804, 9733.85] + - [866, 9733.85] - - [4096, 3520, 1, 1024] - - [803, 9813.95] + - [865, 9813.95] - - [4096, 3942, 1, 1024] - - [814, 9804.39] + - [876, 9804.39] - - [4096, 3431, 1, 1024] - - [799, 9819.06] + - [861, 9819.06] - - [1024, 3271, 1, 4096] - - [816, 8913.08] + - [878, 8913.08] - - [4096, 4020, 1, 1024] - - [803, 9831.42] + - [865, 9831.42] - - [1024, 3481, 1, 4096] - - [819, 9376.15] + - [881, 9376.15] - - [1024, 3419, 1, 4096] - - [822, 9208.68] + - [884, 9208.68] - - [1024, 4059, 1, 4096] - - [806, 9733.83] + - [868, 9733.83] - - [4096, 3345, 1, 1024] - - [815, 9651.43] + - [877, 9651.43] - - [4096, 3394, 1, 1024] - - [815, 9780.43] + - [877, 9780.43] - - [1024, 3298, 1, 4096] - - [822, 8889.63] + - [884, 8889.63] - - [4096, 3235, 1, 1024] - - [804, 9705.81] + - [866, 9705.81] - - [1024, 3681, 1, 33708] - - [811, 9146.22] + - [873, 9146.22] - - [1024, 3840, 1, 4096] - - [804, 9253.95] + - [866, 9253.95] - - [1024, 3362, 1, 4096] - - [823, 9059.81] + - [885, 9059.81] - - [4096, 3467, 1, 1024] - - [803, 9677.51] + - [865, 9677.51] - - [1024, 3349, 1, 4096] - - [823, 9034.07] + - [885, 9034.07] - - [1024, 3460, 1, 4096] - - [823, 9322.94] + - [885, 9322.94] - - [4096, 3214, 1, 1024] - - [804, 9644.46] + - [866, 9644.46] - - [1024, 3398, 1, 4096] - - [823, 9157.29] + - [885, 9157.29] - - [4096, 3478, 1, 1024] - - [803, 9706.66] + - [865, 9706.66] - - [1024, 4050, 1, 33708] - - [803, 9865.14] + - [865, 9865.14] - - [1024, 3244, 1, 4096] - - [819, 8744.53] + - [881, 8744.53] - - [4096, 3341, 1, 1024] - - [815, 9646.79] + - [877, 9646.79] - - [4096, 3454, 1, 1024] - - [800, 9880.56] + - [862, 9880.56] - - [1024, 3166, 1, 4096] - - [817, 8618.46] + - [879, 8618.46] - - [1024, 3425, 1, 4096] - - [823, 9225.32] + - [885, 9225.32] - - [4096, 3295, 1, 1024] - - [804, 9863.81] + - [866, 9863.81] - - [4096, 3072, 1, 1024] - - [803, 9971.09] + - [865, 9971.09] - - [4096, 3822, 1, 1024] - - [804, 9952.07] + - [866, 9952.07] - - [1024, 3681, 1, 4096] - - [805, 8856.94] + - [867, 8856.94] - - [1024, 4050, 1, 4096] - - [805, 9717.58] + - [867, 9717.58] - - [4096, 3495, 1, 1024] - - [803, 9741.14] + - [865, 9741.14] - - [4096, 3560, 1, 1024] - - [804, 9909.14] + - [866, 9909.14] - - [1024, 3524, 1, 4096] - - [822, 9503.2] + - [884, 9503.2] - - [1024, 3942, 1, 33708] - - [803, 9602.67] + - [865, 9602.67] - - [1024, 3304, 1, 4096] - - [802, 8928.76] + - [864, 8928.76] - - [1024, 3387, 1, 4096] - - [823, 9127.65] + - [885, 9127.65] - - [1024, 3498, 1, 4096] - - [822, 9423.39] + - [884, 9423.39] - - [4096, 3458, 1, 1024] - - [803, 9642.63] + - [865, 9642.63] - - [4096, 2967, 1, 1024] - - [803, 9626.71] + - [865, 9626.71] - - [64, 8, 7280, 8] - - [825, 1032.61] + - [887, 1032.61] - - [4096, 3385, 1, 1024] - - [799, 9735.77] + - [861, 9735.77] - - [4096, 3434, 1, 1024] - - [814, 9808.9] + - [876, 9808.9] - - [1024, 3519, 1, 4096] - - [823, 9484.83] + - [885, 9484.83] - - [1024, 3511, 1, 4096] - - [823, 9456.47] + - [885, 9456.47] - - [1024, 3288, 1, 4096] - - [822, 8864.05] + - [884, 8864.05] - - [1024, 2918, 1, 4096] - - [805, 9170.35] + - [867, 9170.35] - - [4096, 3573, 1, 1024] - - [804, 9945.85] + - [866, 9945.85] - - [1024, 3822, 1, 33708] - - [813, 9331.0] + - [875, 9331.0] - - [64, 102, 624, 102] - - [850, 5531.17] + - [912, 5531.17] - - [4096, 3539, 1, 1024] - - [804, 9855.39] + - [866, 9855.39] - - [4096, 3332, 1, 1024] - - [815, 9648.97] + - [877, 9648.97] - - [4096, 3286, 1, 1024] - - [804, 9846.42] + - [866, 9846.42] - - [1024, 4026, 1, 4096] - - [805, 9675.94] + - [867, 9675.94] - - [1024, 3277, 1, 4096] - - [819, 8836.21] + - [881, 8836.21] - - [1024, 3471, 1, 4096] - - [823, 9346.33] + - [885, 9346.33] - - [4096, 3518, 1, 1024] - - [804, 9804.2] + - [866, 9804.2] - - [1024, 3393, 1, 4096] - - [823, 9148.99] + - [885, 9148.99] - - [4096, 3413, 1, 1024] - - [800, 9785.17] + - [862, 9785.17] - - [4096, 3303, 1, 1024] - - [804, 9884.37] + - [866, 9884.37] - - [1024, 3207, 1, 4096] - - [801, 8714.69] + - [863, 8714.69] - - [1024, 3894, 1, 1024] - - [817, 9181.51] + - [879, 9181.51] - - [1024, 3977, 1, 1024] - - [817, 9240.9] + - [879, 9240.9] - - [64, 135, 480, 133] - - [832, 5923.4] + - [894, 5923.4] - - [4096, 3535, 1, 1024] - - [804, 9839.55] + - [866, 9839.55] - - [4096, 3376, 1, 1024] - - [799, 9712.02] + - [861, 9712.02] - - [1024, 3355, 1, 4096] - - [823, 9043.27] + - [885, 9043.27] - - [64, 27, 2336, 27] - - [851, 2929.9] + - [913, 2929.9] - - [1024, 3466, 1, 4096] - - [823, 9339.1] + - [885, 9339.1] - - [4096, 3266, 1, 1024] - - [804, 9789.29] + - [866, 9789.29] - - [1024, 3404, 1, 4096] - - [823, 9176.76] + - [885, 9176.76] - - [1024, 3999, 1, 1024] - - [816, 9391.91] + - [878, 9391.91] - - [64, 148, 432, 143] - - [829, 6182.92] + - [891, 6182.92] - - [4096, 3498, 1, 1024] - - [803, 9764.56] + - [865, 9764.56] - - [1024, 4032, 1, 1024] - - [801, 9402.03] + - [863, 9402.03] - - [1024, 3410, 1, 4096] - - [822, 9183.5] + - [884, 9183.5] - - [4096, 3393, 1, 1024] - - [815, 9695.49] + - [877, 9695.49] - - [1024, 3140, 1, 4096] - - [816, 8504.86] + - [878, 8504.86] - - [1024, 3910, 1, 33708] - - [803, 9526.06] + - [865, 9526.06] - - [1024, 3334, 1, 4096] - - [822, 8987.59] + - [884, 8987.59] - - [4096, 3140, 1, 1024] - - [815, 9660.71] + - [877, 9660.71] - - [1024, 4005, 1, 4096] - - [806, 9629.88] + - [868, 9629.88] - - [1024, 3579, 1, 4096] - - [822, 9661.45] + - [884, 9661.45] - - [4096, 3372, 1, 1024] - - [815, 9697.32] + - [877, 9697.32] - - [1024, 3245, 1, 4096] - - [816, 8847.76] + - [878, 8847.76] - - [64, 38, 1680, 38] - - [826, 3340.44] + - [888, 3340.44] - - [4096, 3956, 1, 1024] - - [815, 9911.15] + - [877, 9911.15] - - [4096, 3213, 1, 1024] - - [803, 9643.11] + - [865, 9643.11] - - [1024, 3361, 1, 4096] - - [823, 9062.24] + - [885, 9062.24] - - [1024, 3536, 1, 4096] - - [822, 9530.65] + - [884, 9530.65] - - [1024, 3968, 1, 1024] - - [817, 9377.92] + - [879, 9377.92] - - [4096, 3477, 1, 1024] - - [804, 9700.77] + - [866, 9700.77] - - [4096, 3526, 1, 1024] - - [804, 9824.41] + - [866, 9824.41] - - [1024, 4005, 1, 1024] - - [801, 9362.39] + - [863, 9362.39] - - [1024, 3530, 1, 4096] - - [820, 9487.17] + - [882, 9487.17] - - [1024, 3944, 1, 4096] - - [805, 9464.55] + - [867, 9464.55] - - [4096, 3453, 1, 1024] - - [814, 9826.77] + - [876, 9826.77] - - [4096, 3184, 1, 1024] - - [815, 9833.59] + - [877, 9833.59] - - [4096, 3579, 1, 1024] - - [804, 9962.55] + - [866, 9962.55] - - [4096, 3351, 1, 1024] - - [815, 9653.34] + - [877, 9653.34] - - [4096, 3416, 1, 1024] - - [799, 9810.4] + - [861, 9810.4] - - [64, 100, 624, 100] - - [850, 5408.55] + - [912, 5408.55] - - [1024, 3822, 1, 4096] - - [805, 9196.2] + - [867, 9196.2] - - [1024, 3796, 1, 4096] - - [805, 9131.96] + - [867, 9131.96] - - [4096, 3257, 1, 1024] - - [803, 9767.34] + - [865, 9767.34] - - [4096, 3306, 1, 1024] - - [803, 9893.35] + - [865, 9893.35] - - [1024, 3505, 1, 4096] - - [823, 9450.02] + - [885, 9450.02] - - [1024, 3315, 1, 4096] - - [816, 8979.77] + - [878, 8979.77] - - [1024, 3486, 1, 4096] - - [822, 9393.48] + - [884, 9393.48] - - [4096, 3457, 1, 1024] - - [803, 9653.19] + - [865, 9653.19] - - [4096, 3870, 1, 1024] - - [800, 9717.51] + - [862, 9717.51] - - [1024, 3447, 1, 4096] - - [823, 9273.14] + - [885, 9273.14] - - [1024, 3558, 1, 4096] - - [820, 9567.33] + - [882, 9567.33] - - [4096, 3433, 1, 1024] - - [800, 9759.26] + - [862, 9759.26] - - [4096, 3180, 1, 1024] - - [815, 9738.63] + - [877, 9738.63] - - [1024, 3213, 1, 4096] - - [801, 8692.25] + - [863, 8692.25] - - [1024, 3900, 1, 4096] - - [805, 9388.61] + - [867, 9388.61] - - [4096, 3444, 1, 1024] - - [814, 9869.73] + - [876, 9869.73] - - [1024, 3504, 1, 4096] - - [823, 9429.38] + - [885, 9429.38] - - [4096, 4059, 1, 1024] - - [804, 9920.79] + - [866, 9920.79] - - [1024, 3442, 1, 4096] - - [823, 9273.01] + - [885, 9273.01] - - [4096, 3517, 1, 1024] - - [803, 9808.19] + - [865, 9808.19] - - [1024, 3566, 1, 4096] - - [822, 9622.89] + - [884, 9622.89] - - [4096, 3248, 1, 1024] - - [803, 9730.33] + - [865, 9730.33] - - [1024, 3547, 1, 4096] - - [822, 9564.73] + - [884, 9564.73] - - [64, 59, 1088, 59] - - [841, 4611.76] + - [903, 4611.76] - - [1024, 3340, 1, 4096] - - [822, 8992.21] + - [884, 8992.21] - - [4096, 3480, 1, 1024] - - [804, 9710.17] + - [866, 9710.17] - - [1024, 3968, 1, 4096] - - [804, 9543.11] + - [866, 9543.11] - - [4096, 3424, 1, 1024] - - [800, 9808.66] + - [862, 9808.66] - - [1024, 3906, 1, 1024] - - [802, 9150.54] + - [864, 9150.54] - - [4096, 3265, 1, 1024] - - [803, 9786.85] + - [865, 9786.85] - - [1024, 3384, 1, 4096] - - [823, 9119.56] + - [885, 9119.56] - - [1024, 3494, 1, 4096] - - [820, 9415.52] + - [882, 9415.52] - - [1024, 3236, 1, 4096] - - [817, 8767.14] + - [879, 8767.14] - - [4096, 3497, 1, 1024] - - [804, 9750.86] + - [866, 9750.86] - - [4096, 3354, 1, 1024] - - [815, 9665.17] + - [877, 9665.17] - - [4096, 3055, 1, 1024] - - [804, 9884.09] + - [866, 9884.09] - - [64, 11, 5456, 11] - - [827, 1368.34] + - [889, 1368.34] - - [4096, 3244, 1, 1024] - - [803, 9720.02] + - [865, 9720.02] - - [4096, 3139, 1, 1024] - - [814, 9737.06] + - [876, 9737.06] - - [4096, 3508, 1, 1024] - - [803, 9771.66] + - [865, 9771.66] - - [4096, 4050, 1, 1024] - - [803, 9898.79] + - [865, 9898.79] - - [1024, 3472, 1, 4096] - - [822, 9353.83] + - [884, 9353.83] - - [1024, 3861, 1, 1024] - - [801, 9061.32] + - [863, 9061.32] - - [1024, 3910, 1, 1024] - - [805, 9043.54] + - [867, 9043.54] - - [4096, 3371, 1, 1024] - - [815, 9738.24] + - [877, 9738.24] - - [64, 65, 992, 65] - - [854, 4354.59] + - [916, 4354.59] - - [1024, 3751, 1, 4096] - - [804, 9018.74] + - [866, 9018.74] - - [4096, 3325, 1, 1024] - - [803, 9958.73] + - [865, 9958.73] - - [1024, 3321, 1, 4096] - - [823, 8952.55] + - [885, 8952.55] - - [1024, 3944, 1, 1024] - - [802, 9117.35] + - [864, 9117.35] - - [4096, 3525, 1, 1024] - - [804, 9822.14] + - [866, 9822.14] - - [4096, 3382, 1, 1024] - - [815, 9720.21] + - [877, 9720.21] - - [64, 122, 528, 122] - - [850, 6389.33] + - [912, 6389.33] - - [1024, 3453, 1, 4096] - - [820, 9305.03] + - [882, 9305.03] - - [4096, 3564, 1, 1024] - - [803, 9911.32] + - [865, 9911.32] - - [4096, 3288, 1, 1024] - - [803, 9841.17] + - [865, 9841.17] - - [1024, 3925, 1, 4096] - - [804, 9418.95] + - [866, 9418.95] - - [1024, 3057, 1, 4096] - - [805, 9590.51] + - [867, 9590.51] - - [4096, 3488, 1, 1024] - - [804, 9732.5] + - [866, 9732.5] - - [4096, 3046, 1, 1024] - - [804, 9850.72] + - [866, 9850.72] - - [1024, 3189, 1, 4096] - - [816, 8677.02] + - [878, 8677.02] - - [4096, 3399, 1, 1024] - - [800, 9673.09] + - [862, 9673.09] - - [1024, 3383, 1, 4096] - - [823, 9102.37] + - [885, 9102.37] - - [1024, 3415, 1, 4096] - - [823, 9216.37] + - [885, 9216.37] - - [1024, 3388, 1, 4096] - - [823, 9127.53] + - [885, 9127.53] - - [1024, 3376, 1, 4096] - - [820, 9090.53] + - [882, 9090.53] - - [1024, 3473, 1, 4096] - - [823, 9354.12] + - [885, 9354.12] - - [4096, 3162, 1, 1024] - - [799, 9694.83] + - [861, 9694.83] - - [1024, 3448, 1, 4096] - - [823, 9283.45] + - [885, 9283.45] - - [4096, 3362, 1, 1024] - - [815, 9673.33] + - [877, 9673.33] - - [64, 228, 272, 228] - - [808, 7039.13] + - [870, 7039.13] - - [1024, 3262, 1, 4096] - - [817, 8850.84] + - [879, 8850.84] - - [1024, 3184, 1, 4096] - - [802, 8625.37] + - [864, 8625.37] - - [1024, 3378, 1, 4096] - - [822, 9105.27] + - [884, 9105.27] - - [4096, 3548, 1, 1024] - - [803, 9877.83] + - [865, 9877.83] - - [4096, 2977, 1, 1024] - - [803, 9647.81] + - [865, 9647.81] - - [64, 21, 2976, 21] - - [838, 2364.81] + - [900, 2364.81] - - [64, 112, 576, 111] - - [837, 5973.68] + - [899, 5973.68] - - [4096, 3443, 1, 1024] - - [799, 9784.5] + - [861, 9784.5] - - [1024, 3289, 1, 4096] - - [823, 8874.04] + - [885, 8874.04] - - [1024, 3483, 1, 4096] - - [819, 9380.57] + - [881, 9380.57] - - [4096, 3190, 1, 1024] - - [815, 9850.96] + - [877, 9850.96] - - [1024, 3421, 1, 4096] - - [823, 9214.06] + - [885, 9214.06] - - [1024, 3514, 1, 4096] - - [822, 9458.23] + - [884, 9458.23] - - [1024, 3532, 1, 4096] - - [823, 9513.03] + - [885, 9513.03] - - [1024, 3565, 1, 4096] - - [822, 9630.6] + - [884, 9630.6] - - [4096, 3422, 1, 1024] - - [800, 9733.79] + - [862, 9733.79] - - [4096, 3263, 1, 1024] - - [804, 9776.94] + - [866, 9776.94] - - [4096, 3296, 1, 1024] - - [804, 9860.61] + - [866, 9860.61] - - [4096, 3640, 1, 1024] - - [814, 9782.3] + - [876, 9782.3] - - [4096, 3463, 1, 1024] - - [803, 9672.0] + - [865, 9672.0] - - [4096, 3528, 1, 1024] - - [804, 9829.98] + - [866, 9829.98] - - [1024, 3351, 1, 4096] - - [817, 9054.37] + - [879, 9054.37] - - [1024, 3462, 1, 4096] - - [823, 9327.85] + - [885, 9327.85] - - [4096, 3226, 1, 1024] - - [804, 9674.93] + - [866, 9674.93] - - [4096, 3439, 1, 1024] - - [799, 9823.18] + - [861, 9823.18] - - [4096, 3121, 1, 1024] - - [799, 9672.64] + - [861, 9672.64] - - [1024, 4059, 1, 33708] - - [803, 9885.72] + - [865, 9885.72] - - [1024, 3311, 1, 4096] - - [823, 8910.01] + - [885, 8910.01] - - [1024, 3230, 1, 4096] - - [823, 8705.9] + - [885, 8705.9] - - [4096, 3353, 1, 1024] - - [815, 9671.86] + - [877, 9671.86] - - [4096, 3402, 1, 1024] - - [800, 9727.04] + - [862, 9727.04] - - [1024, 3427, 1, 4096] - - [823, 9233.55] + - [885, 9233.55] - - [1024, 3346, 1, 4096] - - [823, 9015.77] + - [885, 9015.77] - - [1024, 3126, 1, 4096] - - [817, 8519.31] + - [879, 8519.31] - - [1024, 3796, 1, 1024] - - [801, 8916.75] + - [863, 8916.75] - - [1024, 3990, 1, 4096] - - [805, 9600.86] + - [867, 9600.86] - - [1024, 3257, 1, 4096] - - [801, 8790.42] + - [863, 8790.42] - - [4096, 3996, 1, 1024] - - [804, 9788.25] + - [866, 9788.25] - - [64, 143, 432, 143] - - [832, 6087.24] + - [894, 6087.24] - - [1024, 3306, 1, 4096] - - [816, 9035.69] + - [878, 9035.69] - - [1024, 3389, 1, 4096] - - [823, 9134.92] + - [885, 9134.92] - - [1024, 3500, 1, 4096] - - [823, 9443.33] + - [885, 9443.33] - - [1024, 3999, 1, 33708] - - [804, 9741.24] + - [866, 9741.24] - - [4096, 3486, 1, 1024] - - [804, 9719.67] + - [866, 9719.67] - - [1024, 3438, 1, 4096] - - [823, 9259.38] + - [885, 9259.38] - - [4096, 3616, 1, 1024] - - [814, 9739.77] + - [876, 9739.77] - - [1024, 3955, 1, 1024] - - [816, 9260.37] + - [878, 9260.37] - - [4096, 3430, 1, 1024] - - [815, 9819.95] + - [877, 9819.95] - - [4096, 3271, 1, 1024] - - [804, 9802.04] + - [866, 9802.04] - - [1024, 3364, 1, 4096] - - [816, 9144.63] + - [878, 9144.63] - - [64, 54, 1184, 54] - - [836, 4315.78] + - [898, 4315.78] - - [1024, 3497, 1, 4096] - - [823, 9429.42] + - [885, 9429.42] - - [4096, 3503, 1, 1024] - - [803, 9764.48] + - [865, 9764.48] - - [4096, 3344, 1, 1024] - - [800, 9614.16] + - [862, 9614.16] - - [1024, 3457, 1, 4096] - - [823, 9320.6] + - [885, 9320.6] - - [4096, 3466, 1, 1024] - - [803, 9677.81] + - [865, 9677.81] - - [1024, 3976, 1, 33708] - - [804, 9685.38] + - [866, 9685.38] - - [1024, 3395, 1, 4096] - - [822, 9146.39] + - [884, 9146.39] - - [4096, 3361, 1, 1024] - - [814, 9677.89] + - [876, 9677.89] - - [1024, 3751, 1, 33708] - - [812, 9234.69] + - [874, 9234.69] - - [1024, 3822, 1, 1024] - - [801, 8977.83] + - [863, 8977.83] - - [4096, 3315, 1, 1024] - - [804, 9922.54] + - [866, 9922.54] - - [1024, 3163, 1, 4096] - - [816, 8577.79] + - [878, 8577.79] - - [4096, 3547, 1, 1024] - - [804, 9882.92] + - [866, 9882.92] - - [4096, 3340, 1, 1024] - - [814, 9635.42] + - [876, 9635.42] - - [1024, 3296, 1, 4096] - - [823, 8874.66] + - [885, 8874.66] - - [1024, 3468, 1, 4096] - - [823, 9350.26] + - [885, 9350.26] - - [4096, 3294, 1, 1024] - - [803, 9856.87] + - [865, 9856.87] - - [1024, 3406, 1, 4096] - - [819, 9162.84] + - [881, 9162.84] - - [1024, 3860, 1, 33708] - - [803, 9403.56] + - [865, 9403.56] - - [1024, 3584, 1, 4096] - - [820, 9677.44] + - [882, 9677.44] - - [4096, 3189, 1, 1024] - - [815, 9820.69] + - [877, 9820.69] - - [4096, 3494, 1, 1024] - - [803, 9747.68] + - [865, 9747.68] - - [64, 135, 480, 135] - - [829, 5966.34] + - [891, 5966.34] - - [1024, 3093, 1, 4096] - - [817, 8446.06] + - [879, 8446.06] - - [4096, 3421, 1, 1024] - - [800, 9776.03] + - [862, 9776.03] - - [1024, 3479, 1, 4096] - - [823, 9376.54] + - [885, 9376.54] - - [1024, 3433, 1, 4096] - - [823, 9251.14] + - [885, 9251.14] - - [4096, 3311, 1, 1024] - - [803, 9901.53] + - [865, 9901.53] - - [1024, 3381, 1, 4096] - - [823, 9103.99] + - [885, 9103.99] - - [1024, 3996, 1, 4096] - - [804, 9609.56] + - [866, 9609.56] - - [4096, 3384, 1, 1024] - - [814, 9750.01] + - [876, 9750.01] - - [1024, 3247, 1, 4096] - - [802, 8872.59] + - [864, 8872.59] - - [1024, 3169, 1, 4096] - - [801, 8597.61] + - [863, 8597.61] - - [1024, 3088, 1, 4096] - - [817, 8410.07] + - [879, 8410.07] - - [1024, 3363, 1, 4096] - - [823, 9069.5] + - [885, 9069.5] - - [1024, 3538, 1, 4096] - - [822, 9529.68] + - [884, 9529.68] - - [1024, 3996, 1, 1024] - - [806, 9323.06] + - [868, 9323.06] - - [4096, 3169, 1, 1024] - - [800, 9821.4] + - [862, 9821.4] - - [4096, 3538, 1, 1024] - - [803, 9859.42] + - [865, 9859.42] - - [4096, 3401, 1, 1024] - - [800, 9754.5] + - [862, 9754.5] - - [4096, 3581, 1, 1024] - - [803, 9960.71] + - [865, 9960.71] - - [1024, 3180, 1, 4096] - - [801, 8635.05] + - [863, 8635.05] - - [1024, 3870, 1, 1024] - - [802, 9085.69] + - [864, 9085.69] - - [4096, 3555, 1, 1024] - - [803, 9905.74] + - [865, 9905.74] - - [4096, 3412, 1, 1024] - - [815, 9778.56] + - [877, 9778.56] - - [4096, 3302, 1, 1024] - - [803, 9888.71] + - [865, 9888.71] - - [1024, 3561, 1, 4096] - - [819, 9597.05] + - [881, 9597.05] - - [1024, 3302, 1, 4096] - - [823, 8900.87] + - [885, 8900.87] - - [1024, 3976, 1, 4096] - - [805, 9563.22] + - [867, 9563.22] - - [4096, 3485, 1, 1024] - - [803, 9722.57] + - [865, 9722.57] - - [4096, 3534, 1, 1024] - - [803, 9847.22] + - [865, 9847.22] - - [1024, 3110, 1, 4096] - - [816, 8458.56] + - [878, 8458.56] - - [1024, 3401, 1, 4096] - - [823, 9174.81] + - [885, 9174.81] - - [4096, 3216, 1, 1024] - - [803, 9645.49] + - [865, 9645.49] - - [1024, 4020, 1, 33708] - - [803, 9793.61] + - [865, 9793.61] - - [1024, 3215, 1, 4096] - - [823, 8677.51] + - [885, 8677.51] - - [4096, 3566, 1, 1024] - - [803, 9924.78] + - [865, 9924.78] - - [1024, 3137, 1, 4096] - - [801, 8547.07] + - [863, 8547.07] - - [4096, 3359, 1, 1024] - - [800, 9673.73] + - [862, 9673.73] - - [4096, 3392, 1, 1024] - - [815, 9757.51] + - [877, 9757.51] - - [1024, 3506, 1, 4096] - - [823, 9443.0] + - [885, 9443.0] - - [4096, 3233, 1, 1024] - - [803, 9698.7] + - [865, 9698.7] - - [1024, 3444, 1, 4096] - - [823, 9275.54] + - [885, 9275.54] - - [1024, 3975, 1, 4096] - - [804, 9556.87] + - [866, 9556.87] - - [1024, 3870, 1, 33708] - - [803, 9427.44] + - [865, 9427.44] - - [4096, 3465, 1, 1024] - - [804, 9675.01] + - [866, 9675.01] - - [4096, 3968, 1, 1024] - - [800, 9927.93] + - [862, 9927.93] - - [1024, 3523, 1, 4096] - - [823, 9494.15] + - [885, 9494.15] - - [64, 10, 5952, 10] - - [827, 1224.16] + - [889, 1224.16] - - [4096, 3990, 1, 1024] - - [803, 9771.27] + - [865, 9771.27] - - [1024, 3549, 1, 4096] - - [822, 9553.42] + - [884, 9553.42] - - [1024, 3342, 1, 4096] - - [823, 9007.31] + - [885, 9007.31] - - [4096, 3476, 1, 1024] - - [803, 9703.66] + - [865, 9703.66] - - [64, 232, 272, 228] - - [809, 7078.93] + - [871, 7078.93] - - [1024, 3418, 1, 4096] - - [823, 9213.09] + - [885, 9213.09] - - [1024, 3859, 1, 1024] - - [802, 9087.54] + - [864, 9087.54] - - [4096, 3339, 1, 1024] - - [815, 9594.0] + - [877, 9594.0] - - [4096, 3452, 1, 1024] - - [800, 9872.69] + - [862, 9872.69] - - [4096, 3293, 1, 1024] - - [803, 9842.65] + - [865, 9842.65] - - [4096, 3840, 1, 1024] - - [804, 10030.8] + - [866, 10030.8] - - [1024, 3369, 1, 4096] - - [801, 9099.72] + - [863, 9099.72] - - [64, 193, 320, 193] - - [831, 6425.8] + - [893, 6425.8] - - [1024, 3544, 1, 4096] - - [820, 9556.64] + - [882, 9556.64] - - [4096, 3493, 1, 1024] - - [804, 9743.34] + - [866, 9743.34] - - [4096, 3350, 1, 1024] - - [815, 9653.11] + - [877, 9653.11] - - [64, 71, 896, 71] - - [855, 4686.73] + - [917, 4686.73] - - [4096, 3256, 1, 1024] - - [803, 9763.78] + - [865, 9763.78] - - [1024, 3870, 1, 4096] - - [805, 9305.28] + - [867, 9305.28] - - [4096, 4012, 1, 1024] - - [804, 9817.35] + - [866, 9817.35] - - [1024, 3280, 1, 4096] - - [823, 8842.02] + - [885, 8842.02] - - [4096, 3456, 1, 1024] - - [799, 9874.43] + - [861, 9874.43] - - [1024, 3555, 1, 4096] - - [822, 9599.63] + - [884, 9599.63] - - [4096, 3014, 1, 1024] - - [803, 9762.28] + - [865, 9762.28] - - [1024, 3474, 1, 4096] - - [823, 9373.67] + - [885, 9373.67] - - [4096, 3367, 1, 1024] - - [799, 9694.64] + - [861, 9694.64] - - [4096, 3432, 1, 1024] - - [815, 9855.27] + - [877, 9855.27] - - [64, 84, 752, 84] - - [842, 5247.18] + - [904, 5247.18] - - [4096, 3273, 1, 1024] - - [804, 9801.87] + - [866, 9801.87] - - [4096, 3130, 1, 1024] - - [800, 9672.52] + - [862, 9672.52] - - [1024, 2984, 1, 4096] - - [805, 9403.7] + - [867, 9403.7] - - [1024, 3995, 1, 1024] - - [817, 9392.61] + - [879, 9392.61] - - [1024, 3517, 1, 4096] - - [823, 9481.39] + - [885, 9481.39] - - [1024, 3455, 1, 4096] - - [823, 9302.29] + - [885, 9302.29] - - [1024, 3939, 1, 4096] - - [805, 9469.89] + - [867, 9469.89] - - [64, 49, 1296, 49] - - [835, 3938.96] + - [897, 3938.96] - - [64, 14, 4368, 14] - - [827, 1802.47] + - [889, 1802.47] - - [64, 25, 2512, 25] - - [846, 2760.54] + - [908, 2760.54] - - [4096, 3147, 1, 1024] - - [815, 9713.03] + - [877, 9713.03] - - [4096, 3516, 1, 1024] - - [803, 9805.93] + - [865, 9805.93] - - [1024, 3876, 1, 4096] - - [805, 9320.56] + - [867, 9320.56] - - [1024, 3191, 1, 4096] - - [802, 8640.76] + - [864, 8640.76] - - [4096, 3411, 1, 1024] - - [814, 9737.37] + - [876, 9737.37] - - [1024, 3337, 1, 4096] - - [823, 8990.13] + - [885, 8990.13] - - [1024, 3512, 1, 4096] - - [823, 9459.65] + - [885, 9459.65] - - [4096, 3301, 1, 1024] - - [803, 9877.26] + - [865, 9877.26] - - [1024, 3450, 1, 4096] - - [822, 9283.11] + - [884, 9283.11] - - [4096, 3533, 1, 1024] - - [803, 9848.62] + - [865, 9848.62] - - [4096, 3390, 1, 1024] - - [815, 9764.61] + - [877, 9764.61] - - [4096, 3231, 1, 1024] - - [803, 9693.81] + - [865, 9693.81] - - [1024, 2499, 1, 4096] - - [822, 9304.81] + - [884, 9304.81] - - [1024, 3186, 1, 4096] - - [802, 8649.55] + - [864, 8649.55] - - [1024, 3380, 1, 4096] - - [823, 9101.77] + - [885, 9101.77] - - [4096, 3496, 1, 1024] - - [804, 9754.3] + - [866, 9754.3] - - [1024, 3956, 1, 33708] - - [803, 9636.77] + - [865, 9636.77] - - [1024, 3976, 1, 1024] - - [805, 9248.41] + - [867, 9248.41] - - [4096, 2736, 1, 1024] - - [803, 9651.91] + - [865, 9651.91] - - [1024, 3291, 1, 4096] - - [823, 8868.94] + - [885, 8868.94] - - [1024, 3944, 1, 33708] - - [804, 9607.0] + - [866, 9607.0] - - [1024, 3485, 1, 4096] - - [822, 9385.96] + - [884, 9385.96] - - [4096, 3138, 1, 1024] - - [800, 9672.15] + - [862, 9672.15] - - [1024, 3423, 1, 4096] - - [823, 9222.77] + - [885, 9222.77] - - [1024, 3491, 1, 4096] - - [823, 9405.02] + - [885, 9405.02] - - [1024, 3860, 1, 4096] - - [806, 9282.94] + - [868, 9282.94] - - [4096, 3211, 1, 1024] - - [803, 9640.42] + - [865, 9640.42] - - [1024, 3221, 1, 4096] - - [817, 8709.4] + - [879, 8709.4] - - [1024, 2917, 1, 4096] - - [805, 9177.11] + - [867, 9177.11] - - [4096, 3475, 1, 1024] - - [803, 9703.45] + - [865, 9703.45] - - [4096, 3524, 1, 1024] - - [803, 9816.23] + - [865, 9816.23] - - [4096, 2985, 1, 1024] - - [804, 9686.91] + - [866, 9686.91] - - [1024, 3480, 1, 4096] - - [823, 9380.2] + - [885, 9380.2] - - [4096, 3222, 1, 1024] - - [803, 9666.8] + - [865, 9666.8] - - [4096, 3451, 1, 1024] - - [799, 9877.91] + - [861, 9877.91] - - [1024, 3969, 1, 33708] - - [803, 9669.64] + - [865, 9669.64] - - [1024, 3640, 1, 1024] - - [810, 8565.68] + - [872, 8565.68] - - [1024, 3297, 1, 4096] - - [819, 8889.22] + - [881, 8889.22] - - [4096, 3944, 1, 1024] - - [800, 9902.85] + - [862, 9902.85] - - [1024, 3216, 1, 4096] - - [802, 8695.88] + - [864, 8695.88] - - [1024, 3840, 1, 1024] - - [816, 9046.05] + - [878, 9046.05] - - [4096, 3349, 1, 1024] - - [814, 9676.82] + - [876, 9676.82] - - [4096, 3398, 1, 1024] - - [800, 9775.84] + - [862, 9775.84] - - [1024, 3154, 1, 4096] - - [817, 8662.26] + - [879, 8662.26] - - [1024, 3978, 1, 33708] - - [804, 9689.16] + - [866, 9689.16] - - [1024, 3348, 1, 4096] - - [823, 9014.67] + - [885, 9014.67] - - [4096, 3304, 1, 1024] - - [804, 9886.8] + - [866, 9886.8] - - [4096, 4030, 1, 1024] - - [804, 9859.1] + - [866, 9859.1] - - [1024, 4026, 1, 1024] - - [801, 9326.64] + - [863, 9326.64] - - [4096, 3471, 1, 1024] - - [803, 9683.0] + - [865, 9683.0] - - [1024, 3259, 1, 4096] - - [817, 8792.19] + - [879, 8792.19] - - [64, 132, 480, 132] - - [857, 6027.86] + - [919, 6027.86] - - [1024, 3308, 1, 4096] - - [822, 8905.14] + - [884, 8905.14] - - [4096, 3391, 1, 1024] - - [815, 9765.35] + - [877, 9765.35] - - [1024, 3312, 1, 4096] - - [823, 8917.74] + - [885, 8917.74] - - [1024, 3502, 1, 4096] - - [823, 9435.62] + - [885, 9435.62] - - [1024, 3968, 1, 33708] - - [803, 9668.24] + - [865, 9668.24] - - [1024, 3424, 1, 4096] - - [819, 9215.99] + - [881, 9215.99] - - [64, 13, 4672, 13] - - [828, 1662.35] + - [890, 1662.35] - - [4096, 4032, 1, 1024] - - [814, 9877.82] + - [876, 9877.82] - - [1024, 3900, 1, 1024] - - [817, 9116.93] + - [879, 9116.93] - - [4096, 3442, 1, 1024] - - [814, 9773.18] + - [876, 9773.18] - - [1024, 3366, 1, 4096] - - [823, 9079.46] + - [885, 9079.46] - - [4096, 3999, 1, 1024] - - [803, 9786.46] + - [865, 9786.46] - - [1024, 3477, 1, 4096] - - [823, 9364.89] + - [885, 9364.89] - - [1024, 2505, 1, 4096] - - [823, 9304.03] + - [885, 9304.03] - - [4096, 3515, 1, 1024] - - [803, 9797.93] + - [865, 9797.93] - - [1024, 3564, 1, 4096] - - [819, 9632.86] + - [881, 9632.86] - - [4096, 3057, 1, 1024] - - [804, 9880.19] + - [866, 9880.19] - - [1024, 3339, 1, 4096] - - [802, 9029.86] + - [864, 9029.86] - - [4096, 3262, 1, 1024] - - [803, 9780.1] + - [865, 9780.1] - - [1024, 4030, 1, 4096] - - [806, 9682.0] + - [868, 9682.0] - - [1024, 3265, 1, 4096] - - [823, 8797.52] + - [885, 8797.52] - - [1024, 3459, 1, 4096] - - [823, 9313.06] + - [885, 9313.06] - - [4096, 3462, 1, 1024] - - [804, 9669.73] + - [866, 9669.73] - - [64, 85, 752, 85] - - [842, 5186.93] + - [904, 5186.93] - - [1024, 3513, 1, 4096] - - [820, 9469.15] + - [882, 9469.15] - - [1024, 3397, 1, 4096] - - [823, 9151.77] + - [885, 9151.77] - - [4096, 3572, 1, 1024] - - [803, 9945.7] + - [865, 9945.7] - - [4096, 3389, 1, 1024] - - [815, 9740.86] + - [877, 9740.86] - - [4096, 3438, 1, 1024] - - [815, 9822.47] + - [877, 9822.47] - - [64, 102, 624, 100] - - [850, 5487.0] + - [912, 5487.0] - - [1024, 3640, 1, 33708] - - [811, 9083.53] + - [873, 9083.53] - - [1024, 3995, 1, 33708] - - [804, 9731.99] + - [866, 9731.99] - - [1024, 3165, 1, 4096] - - [816, 8601.9] + - [878, 8601.9] - - [4096, 3543, 1, 1024] - - [804, 9868.63] + - [866, 9868.63] - - [4096, 3352, 1, 1024] - - [799, 9668.44] + - [861, 9668.44] - - [1024, 3359, 1, 4096] - - [820, 9050.33] + - [882, 9050.33] - - [1024, 3470, 1, 4096] - - [823, 9355.17] + - [885, 9355.17] - - [64, 15, 4096, 15] - - [827, 1945.43] + - [889, 1945.43] - - [1024, 3392, 1, 4096] - - [822, 9139.71] + - [884, 9139.71] - - [64, 78, 816, 77] - - [834, 4870.56] + - [896, 4870.56] - - [4096, 3137, 1, 1024] - - [799, 9600.22] + - [861, 9600.22] - - [4096, 3506, 1, 1024] - - [804, 9779.08] + - [866, 9779.08] - - [1024, 3095, 1, 4096] - - [816, 8381.24] + - [878, 8381.24] - - [1024, 3859, 1, 4096] - - [803, 9288.63] + - [865, 9288.63] - - [4096, 3369, 1, 1024] - - [815, 9697.73] + - [877, 9697.73] - - [64, 45, 1424, 45] - - [852, 3883.74] + - [914, 3883.74] - - [1024, 3435, 1, 4096] - - [823, 9264.62] + - [885, 9264.62] - - [1024, 3354, 1, 4096] - - [823, 9035.47] + - [885, 9035.47] - - [1024, 3055, 1, 4096] - - [804, 9597.45] + - [866, 9597.45] - - [4096, 3523, 1, 1024] - - [803, 9821.79] + - [865, 9821.79] - - [4096, 3380, 1, 1024] - - [799, 9721.39] + - [861, 9721.39] - - [1024, 3233, 1, 4096] - - [816, 8724.75] + - [878, 8724.75] - - [4096, 3221, 1, 1024] - - [803, 9661.04] + - [865, 9661.04] - - [4096, 3270, 1, 1024] - - [803, 9797.92] + - [865, 9797.92] - - [4096, 3593, 1, 1024] - - [814, 9679.31] + - [876, 9679.31] - - [1024, 3358, 1, 4096] - - [823, 9051.82] + - [885, 9051.82] - - [1024, 3540, 1, 4096] - - [823, 9533.59] + - [885, 9533.59] - - [4096, 3502, 1, 1024] - - [804, 9760.65] + - [866, 9760.65] - - [4096, 2505, 1, 1024] - - [804, 9680.52] + - [866, 9680.52] - - [4096, 3397, 1, 1024] - - [814, 9785.85] + - [876, 9785.85] - - [1024, 3300, 1, 4096] - - [817, 8907.85] + - [879, 8907.85] - - [4096, 3095, 1, 1024] - - [800, 9618.78] + - [862, 9618.78] - - [1024, 3182, 1, 4096] - - [816, 8606.16] + - [878, 8606.16] - - [1024, 3299, 1, 4096] - - [822, 8885.48] + - [884, 8885.48] - - [1024, 3276, 1, 4096] - - [817, 8872.75] + - [879, 8872.75] - - [1024, 3360, 1, 4096] - - [820, 9044.2] + - [882, 9044.2] - - [4096, 3360, 1, 1024] - - [815, 9681.39] + - [877, 9681.39] - - [4096, 2918, 1, 1024] - - [799, 9732.74] + - [861, 9732.74] - - [1024, 3939, 1, 33708] - - [803, 9595.96] + - [865, 9595.96] - - [4096, 3314, 1, 1024] - - [804, 9915.02] + - [866, 9915.02] - - [1024, 3319, 1, 4096] - - [823, 8956.37] + - [885, 8956.37] - - [64, 35, 1808, 35] - - [840, 3060.27] + - [902, 3060.27] - - [1024, 3942, 1, 1024] - - [816, 9211.83] + - [878, 9211.83] - - [1024, 3465, 1, 4096] - - [823, 9340.73] + - [885, 9340.73] - - [4096, 3546, 1, 1024] - - [804, 9875.41] + - [866, 9875.41] - - [1024, 3403, 1, 4096] - - [816, 9224.34] + - [878, 9224.34] - - [1024, 3948, 1, 1024] - - [802, 9245.63] + - [864, 9245.63] - - [4096, 3441, 1, 1024] - - [815, 9758.72] + - [877, 9758.72] - - [1024, 3139, 1, 4096] - - [816, 8582.84] + - [878, 8582.84] - - [1024, 3563, 1, 4096] - - [823, 9620.74] + - [885, 9620.74] - - [1024, 3508, 1, 4096] - - [820, 9449.36] + - [882, 9449.36] - - [1024, 3975, 1, 33708] - - [803, 9683.55] + - [865, 9683.55] - - [1024, 3446, 1, 4096] - - [822, 9289.51] + - [884, 9289.51] - - [1024, 3529, 1, 4096] - - [819, 9491.29] + - [881, 9491.29] - - [64, 112, 576, 112] - - [844, 6387.14] + - [906, 6387.14] - - [4096, 3461, 1, 1024] - - [804, 9663.33] + - [866, 9663.33] - - [1024, 3574, 1, 4096] - - [822, 9662.88] + - [884, 9662.88] - - [1024, 3101, 1, 4096] - - [817, 8468.34] + - [879, 8468.34] - - [1024, 3927, 1, 1024] - - [802, 9207.97] + - [864, 9207.97] - - [4096, 3224, 1, 1024] - - [804, 9665.61] + - [866, 9665.61] - - [4096, 3437, 1, 1024] - - [800, 9857.21] + - [862, 9857.21] - - [4096, 3900, 1, 1024] - - [815, 9826.25] + - [877, 9826.25] - - [1024, 3495, 1, 4096] - - [823, 9412.41] + - [885, 9412.41] - - [1024, 3977, 1, 33708] - - [803, 9687.87] + - [865, 9687.87] - - [1024, 3328, 1, 4096] - - [823, 8975.57] + - [885, 8975.57] - - [4096, 3168, 1, 1024] - - [799, 9754.87] + - [861, 9754.87] - - [1024, 4026, 1, 33708] - - [803, 9807.24] + - [865, 9807.24] - - [1024, 3292, 1, 4096] - - [816, 8901.83] + - [878, 8901.83] - - [1024, 3294, 1, 4096] - - [823, 8877.03] + - [885, 8877.03] - - [4096, 3335, 1, 1024] - - [800, 9616.23] + - [862, 9616.23] - - [4096, 3400, 1, 1024] - - [814, 9710.73] + - [876, 9710.73] - - [1024, 3287, 1, 4096] - - [801, 8908.07] + - [863, 8908.07] - - [1024, 3910, 1, 4096] - - [805, 9401.03] + - [867, 9401.03] - - [1024, 3780, 1, 1024] - - [816, 8863.29] + - [878, 8863.29] - - [4096, 3098, 1, 1024] - - [800, 9606.47] + - [862, 9606.47] - - [1024, 3584, 1, 33708] - - [823, 9775.33] + - [885, 9775.33] - - [64, 29, 2176, 29] - - [845, 3135.03] + - [907, 3135.03] - - [1024, 3371, 1, 4096] - - [801, 9117.81] + - [863, 9117.81] - - [1024, 3546, 1, 4096] - - [823, 9547.3] + - [885, 9547.3] - - [1024, 4012, 1, 1024] - - [805, 9353.73] + - [867, 9353.73] - - [4096, 3505, 1, 1024] - - [803, 9773.17] + - [865, 9773.17] - - [4096, 3554, 1, 1024] - - [803, 9895.59] + - [865, 9895.59] - - [4096, 3063, 1, 1024] - - [803, 9898.98] + - [865, 9898.98] - - [1024, 3900, 1, 33708] - - [804, 9502.93] + - [866, 9502.93] - - [1024, 3345, 1, 4096] - - [823, 9015.85] + - [885, 9015.85] - - [1024, 3357, 1, 4096] - - [823, 9041.23] + - [885, 9041.23] - - [1024, 3282, 1, 4096] - - [816, 8860.17] + - [878, 8860.17] - - [4096, 3484, 1, 1024] - - [804, 9721.33] + - [866, 9721.33] - - [1024, 3557, 1, 4096] - - [820, 9573.48] + - [882, 9573.48] - - [1024, 3476, 1, 4096] - - [823, 9361.72] + - [885, 9361.72] - - [1024, 3751, 1, 1024] - - [817, 8849.11] + - [879, 8849.11] - - [4096, 3379, 1, 1024] - - [800, 9741.49] + - [862, 9741.49] - - [4096, 3428, 1, 1024] - - [799, 9767.82] + - [861, 9767.82] - - [4096, 3126, 1, 1024] - - [814, 9701.9] + - [876, 9701.9] - - [64, 41, 1552, 41] - - [849, 3555.69] + - [911, 3555.69] - - [1024, 3325, 1, 4096] - - [801, 8962.41] + - [863, 8962.41] - - [4096, 3501, 1, 1024] - - [803, 9762.01] + - [865, 9762.01] - - [4096, 3358, 1, 1024] - - [799, 9680.42] + - [861, 9680.42] - - [1024, 3441, 1, 4096] - - [823, 9271.27] + - [885, 9271.27] - - [1024, 3552, 1, 4096] - - [819, 9565.42] + - [881, 9565.42] - - [4096, 3232, 1, 1024] - - [804, 9696.81] + - [866, 9696.81] - - [64, 18, 3440, 18] - - [824, 2059.33] + - [886, 2059.33] - - [1024, 3412, 1, 4096] - - [823, 9199.28] + - [885, 9199.28] - - [1024, 3372, 1, 4096] - - [820, 9083.49] + - [882, 9083.49] - - [1024, 3585, 1, 4096] - - [810, 8710.29] + - [872, 8710.29] - - [4096, 3143, 1, 1024] - - [815, 9692.12] + - [877, 9692.12] - - [4096, 3464, 1, 1024] - - [803, 9661.93] + - [865, 9661.93] - - [1024, 3145, 1, 4096] - - [802, 8526.33] + - [864, 8526.33] - - [4096, 3375, 1, 1024] - - [814, 9734.78] + - [876, 9734.78] - - [4096, 2917, 1, 1024] - - [799, 9714.57] + - [861, 9714.57] - - [4096, 3978, 1, 1024] - - [804, 9741.43] + - [866, 9741.43] - - [1024, 2765, 1, 4096] - - [805, 8706.75] + - [867, 8706.75] - - [64, 148, 432, 148] - - [830, 6372.17] + - [892, 6372.17] - - [1024, 3452, 1, 4096] - - [822, 9301.38] + - [884, 9301.38] - - [4096, 3584, 1, 1024] - - [804, 10005.7] + - [866, 10005.7] - - [4096, 3545, 1, 1024] - - [804, 9877.87] + - [866, 9877.87] - - [1024, 3352, 1, 4096] - - [823, 9035.19] + - [885, 9035.19] - - [64, 159, 400, 160] - - [832, 6952.11] + - [894, 6952.11] - - [4096, 3292, 1, 1024] - - [803, 9856.51] + - [865, 9856.51] - - [1024, 3525, 1, 4096] - - [823, 9501.5] + - [885, 9501.5] - - [1024, 3266, 1, 4096] - - [823, 8817.43] + - [885, 8817.43] - - [1024, 3382, 1, 4096] - - [822, 9101.54] + - [884, 9101.54] - - [4096, 3492, 1, 1024] - - [803, 9747.29] + - [865, 9747.29] - - [4096, 3419, 1, 1024] - - [815, 9745.88] + - [877, 9745.88] - - [1024, 3796, 1, 33708] - - [812, 9356.26] + - [874, 9356.26] - - [1024, 3293, 1, 4096] - - [819, 8868.4] + - [881, 8868.4] - - [4096, 3796, 1, 1024] - - [804, 9885.36] + - [866, 9885.36] - - [1024, 3487, 1, 4096] - - [820, 9391.34] + - [882, 9391.34] - - [4096, 3166, 1, 1024] - - [815, 9718.46] + - [877, 9718.46] - - [64, 102, 624, 101] - - [844, 5547.84] + - [906, 5547.84] - - [1024, 3409, 1, 4096] - - [823, 9187.88] + - [885, 9187.88] - - [1024, 3520, 1, 4096] - - [822, 9485.09] + - [884, 9485.09] - - [1024, 3573, 1, 4096] - - [823, 9652.71] + - [885, 9652.71] - - [4096, 3366, 1, 1024] - - [799, 9684.31] + - [861, 9684.31] - - [4096, 3720, 1, 1024] - - [815, 9703.34] + - [877, 9703.34] - - [4096, 3207, 1, 1024] - - [803, 9626.21] + - [865, 9626.21] - - [4096, 3272, 1, 1024] - - [803, 9795.51] + - [865, 9795.51] - - [1024, 3390, 1, 4096] - - [823, 9125.88] + - [885, 9125.88] - - [4096, 3183, 1, 1024] - - [815, 9825.87] + - [877, 9825.87] - - [4096, 3536, 1, 1024] - - [804, 9846.51] + - [866, 9846.51] - - [4096, 3563, 1, 1024] - - [804, 9913.8] + - [866, 9913.8] - - [1024, 3482, 1, 4096] - - [823, 9376.91] + - [885, 9376.91] - - [4096, 3447, 1, 1024] - - [814, 9875.09] + - [876, 9875.09] - - [4096, 3955, 1, 1024] - - [799, 9922.39] + - [861, 9922.39] - - [4096, 4005, 1, 1024] - - [804, 9803.43] + - [866, 9803.43] - - [1024, 3493, 1, 4096] - - [823, 9411.37] + - [885, 9411.37] - - [4096, 3410, 1, 1024] - - [799, 9788.34] + - [861, 9788.34] - - [1024, 3422, 1, 4096] - - [822, 9216.28] + - [884, 9216.28] - - [1024, 3350, 1, 4096] - - [817, 9068.02] + - [879, 9068.02] - - [4096, 3300, 1, 1024] - - [804, 9883.29] + - [866, 9883.29] - - [4096, 3910, 1, 1024] - - [814, 9800.12] + - [876, 9800.12] - - [1024, 3489, 1, 4096] - - [823, 9398.66] + - [885, 9398.66] - - [4096, 3483, 1, 1024] - - [803, 9715.96] + - [865, 9715.96] - - [4096, 3532, 1, 1024] - - [804, 9837.99] + - [866, 9837.99] - - [64, 101, 624, 101] - - [844, 5452.28] + - [906, 5452.28] - - [4096, 3230, 1, 1024] - - [804, 9683.6] + - [866, 9683.6] - - [4096, 3427, 1, 1024] - - [799, 9760.72] + - [861, 9760.72] - - [1024, 3377, 1, 4096] - - [823, 9101.17] + - [885, 9101.17] - - [1024, 3488, 1, 4096] - - [822, 9381.99] + - [884, 9381.99] - - [1024, 3616, 1, 4096] - - [805, 8709.33] + - [867, 8709.33] - - [1024, 3426, 1, 4096] - - [823, 9229.43] + - [885, 9229.43] - - [4096, 3357, 1, 1024] - - [815, 9668.5] + - [877, 9668.5] - - [4096, 3406, 1, 1024] - - [800, 9748.57] + - [862, 9748.57] - - [1024, 3046, 1, 4096] - - [805, 9590.43] + - [867, 9590.43] - - [1024, 3272, 1, 4096] - - [816, 8930.2] + - [878, 8930.2] - - [1024, 3256, 1, 4096] - - [801, 8828.16] + - [863, 8828.16] - - [4096, 3247, 1, 1024] - - [803, 9741.81] + - [865, 9741.81] - - [4096, 3088, 1, 1024] - - [815, 9589.07] + - [877, 9589.07] - - [1024, 3531, 1, 4096] - - [822, 9501.06] + - [884, 9501.06] - - [64, 160, 400, 160] - - [858, 7334.03] + - [920, 7334.03] - - [4096, 3511, 1, 1024] - - [804, 9789.38] + - [866, 9789.38] - - [1024, 3720, 1, 33708] - - [813, 9214.68] + - [875, 9214.68] - - [1024, 3267, 1, 4096] - - [816, 8831.04] + - [878, 8831.04] - - [1024, 3270, 1, 4096] - - [817, 8876.68] + - [879, 8876.68] - - [1024, 3461, 1, 4096] - - [822, 9327.55] + - [884, 9327.55] - - [4096, 3474, 1, 1024] - - [803, 9697.04] + - [865, 9697.04] - - [4096, 2984, 1, 1024] - - [804, 9674.08] + - [866, 9674.08] - - [1024, 3399, 1, 4096] - - [822, 9158.58] + - [884, 9158.58] - - [4096, 3574, 1, 1024] - - [803, 9942.3] + - [865, 9942.3] - - [1024, 3876, 1, 1024] - - [817, 9085.13] + - [879, 9085.13] - - [4096, 3337, 1, 1024] - - [800, 9611.43] + - [862, 9611.43] - - [4096, 3450, 1, 1024] - - [815, 9930.35] + - [877, 9930.35] - - [1024, 3720, 1, 1024] - - [801, 8755.49] + - [863, 8755.49] - - [1024, 4059, 1, 1024] - - [806, 9366.67] + - [868, 9366.67] - - [4096, 3291, 1, 1024] - - [803, 9856.33] + - [865, 9856.33] - - [64, 93, 688, 93] - - [847, 5497.11] + - [909, 5497.11] - - [4096, 3995, 1, 1024] - - [803, 9776.67] + - [865, 9776.67] - - [64, 147, 432, 147] - - [833, 6233.88] + - [895, 6233.88] - - [4096, 3491, 1, 1024] - - [803, 9742.94] + - [865, 9742.94] - - [4096, 3348, 1, 1024] - - [815, 9634.11] + - [877, 9634.11] - - [4096, 3925, 1, 1024] - - [814, 9848.54] + - [876, 9848.54] - - [4096, 3894, 1, 1024] - - [814, 9812.55] + - [876, 9812.55] - - [1024, 3456, 1, 4096] - - [823, 9317.91] + - [885, 9317.91] - - [1024, 3394, 1, 4096] - - [822, 9148.86] + - [884, 9148.86] - - [64, 100, 624, 102] - - [844, 5416.95] + - [906, 5416.95] - - [4096, 3165, 1, 1024] - - [814, 9743.35] + - [876, 9743.35] - - [4096, 3470, 1, 1024] - - [804, 9691.04] + - [866, 9691.04] - - [1024, 3014, 1, 4096] - - [805, 9486.26] + - [867, 9486.26] - - [1024, 3375, 1, 4096] - - [823, 9082.71] + - [885, 9082.71] - - [4096, 3859, 1, 1024] - - [814, 9738.87] + - [876, 9738.87] - - [4096, 3365, 1, 1024] - - [815, 9694.74] + - [877, 9694.74] - - [1024, 3162, 1, 4096] - - [816, 8550.31] + - [878, 8550.31] - - [1024, 3840, 1, 33708] - - [813, 9409.08] + - [875, 9409.08] - - [1024, 3437, 1, 4096] - - [823, 9270.49] + - [885, 9270.49] - - [4096, 3319, 1, 1024] - - [804, 9927.15] + - [866, 9927.15] - - [1024, 3320, 1, 4096] - - [823, 8962.29] + - [885, 8962.29] - - [64, 23, 2720, 23] - - [846, 2569.53] + - [908, 2569.53] - - [4096, 3328, 1, 1024] - - [803, 9997.41] + - [865, 9997.41] - - [1024, 3235, 1, 4096] - - [823, 8724.31] + - [885, 8724.31] - - [4096, 3282, 1, 1024] - - [804, 9827.13] + - [866, 9827.13] - - [1024, 3367, 1, 4096] - - [816, 9084.02] + - [878, 9084.02] - - [1024, 3542, 1, 4096] - - [823, 9533.1] + - [885, 9533.1] - - [64, 177, 352, 177] - - [809, 6817.91] + - [871, 6817.91] - - [4096, 3145, 1, 1024] - - [800, 9710.28] + - [862, 9710.28] - - [4096, 3514, 1, 1024] - - [803, 9793.06] + - [865, 9793.06] - - [1024, 3432, 1, 4096] - - [823, 9249.39] + - [885, 9249.39] - - [4096, 3409, 1, 1024] - - [799, 9721.6] + - [861, 9721.6] - - [1024, 4012, 1, 33708] - - [803, 9773.35] + - [865, 9773.35] - - [4096, 3876, 1, 1024] - - [800, 9745.65] + - [862, 9745.65] - - [4096, 3299, 1, 1024] - - [803, 9873.53] + - [865, 9873.53] - - [1024, 3168, 1, 4096] - - [816, 8597.13] + - [878, 8597.13] - - [4096, 3681, 1, 1024] - - [815, 9840.03] + - [877, 9840.03] - - [4096, 3531, 1, 1024] - - [804, 9847.76] + - [866, 9847.76] - - [4096, 3388, 1, 1024] - - [815, 9772.28] + - [877, 9772.28] - - [1024, 3720, 1, 4096] - - [804, 8951.6] + - [866, 8951.6] - - [1024, 3332, 1, 4096] - - [823, 8978.97] + - [885, 8978.97] - - [1024, 3273, 1, 4096] - - [817, 8982.49] + - [879, 8982.49] - - [1024, 2935, 1, 4096] - - [806, 9224.89] + - [868, 9224.89] - - [1024, 3467, 1, 4096] - - [820, 9329.33] + - [882, 9329.33] - - [4096, 3542, 1, 1024] - - [803, 9858.51] + - [865, 9858.51] - - [1024, 3130, 1, 4096] - - [802, 8526.66] + - [864, 8526.66] - - [1024, 3405, 1, 4096] - - [823, 9163.44] + - [885, 9163.44] - - [1024, 3960, 1, 1024] - - [801, 9280.36] + - [863, 9280.36] - - [4096, 3405, 1, 1024] - - [814, 9710.2] + - [876, 9710.2] - - [512, 512, 1, 1024] - - [1000, 6670.96] + - [1062, 6670.96] - - [8, 500, 1, 512] - - [896, 228.671] + - [958, 228.671] - - [512, 512, 1, 2000] - - [1033, 7629.44] + - [1095, 7629.44] - - [32, 512, 1, 512] - - [893, 904.045] + - [955, 904.045] - - [100, 1024, 1, 2048] - - [955, 3196.98] + - [1017, 3196.98] - - [8, 512, 1, 500] - - [886, 237.137] + - [948, 237.137] - - [8, 500, 1, 1024] - - [950, 289.366] + - [1012, 289.366] - - [100, 2000, 1, 1024] - - [989, 3368.52] + - [1051, 3368.52] - - [64, 1024, 1, 100] - - [888, 941.709] + - [950, 941.709] - - [64, 1024, 1, 500] - - [1015, 2659.84] + - [1077, 2659.84] - - [64, 1024, 1, 1024] - - [953, 2452.91] + - [1015, 2452.91] - - [128, 2000, 1, 100] - - [1009, 2560.1] + - [1071, 2560.1] - - [2, 500, 1, 2048] - - [950, 72.2127] + - [1012, 72.2127] - - [16, 512, 1, 10] - - [864, 18.3857] + - [926, 18.3857] - - [64, 2000, 1, 1024] - - [1020, 2800.78] + - [1082, 2800.78] - - [100, 1024, 1, 1024] - - [948, 3034.17] + - [1010, 3034.17] - - [8, 512, 1, 10] - - [926, 9.24286] + - [988, 9.24286] - - [16, 500, 1, 2048] - - [950, 565.846] + - [1012, 565.846] - - [10, 100, 1, 500] - - [886, 58.5112] + - [948, 58.5112] - - [16, 100, 1, 10] - - [926, 3.67143] + - [988, 3.67143] - - [500, 1024, 1, 512] - - [1016, 6514.61] + - [1078, 6514.61] - - [128, 1024, 1, 512] - - [1034, 4194.4] + - [1096, 4194.4] - - [512, 500, 1, 2000] - - [992, 7347.98] + - [1054, 7347.98] - - [2, 100, 1, 2000] - - [886, 20.9333] + - [948, 20.9333] - - [500, 512, 1, 100] - - [1008, 2539.78] + - [1070, 2539.78] - - [100, 1024, 1, 500] - - [1034, 3216.18] + - [1096, 3216.18] - - [256, 100, 1, 2048] - - [1044, 1689.17] + - [1106, 1689.17] - - [2, 512, 1, 512] - - [900, 50.5123] + - [962, 50.5123] - - [128, 2000, 1, 512] - - [1020, 4641.46] + - [1082, 4641.46] - - [2, 100, 1, 10] - - [864, 0.496825] + - [926, 0.496825] - - [16, 2000, 1, 2048] - - [908, 1266.25] + - [970, 1266.25] - - [200, 100, 1, 100] - - [1054, 316.556] + - [1116, 316.556] - - [256, 1024, 1, 100] - - [1010, 2686.0] + - [1072, 2686.0] - - [200, 500, 1, 1024] - - [1059, 3282.15] + - [1121, 3282.15] - - [500, 100, 1, 100] - - [973, 631.413] + - [1035, 631.413] - - [4, 100, 1, 10] - - [871, 0.977193] + - [933, 0.977193] - - [32, 100, 1, 512] - - [950, 198.935] + - [1012, 198.935] - - [100, 2000, 1, 512] - - [1020, 3832.44] + - [1082, 3832.44] - - [16, 1024, 1, 512] - - [934, 794.476] + - [996, 794.476] - - [200, 512, 1, 100] - - [1052, 1306.22] + - [1114, 1306.22] - - [4, 1024, 1, 1024] - - [893, 213.225] + - [955, 213.225] - - [512, 1024, 1, 512] - - [1017, 7049.35] + - [1079, 7049.35] - - [4, 512, 1, 10] - - [925, 4.59123] + - [987, 4.59123] - - [2, 2048, 1, 2000] - - [886, 300.393] + - [948, 300.393] - - [64, 2048, 1, 10] - - [1046, 241.041] + - [1108, 241.041] - - [128, 100, 1, 10] - - [1051, 27.6862] + - [1113, 27.6862] - - [4, 512, 1, 2048] - - [886, 146.549] + - [948, 146.549] - - [64, 2048, 1, 500] - - [1026, 4015.79] + - [1088, 4015.79] - - [512, 512, 1, 512] - - [981, 6123.17] + - [1043, 6123.17] - - [500, 500, 1, 2000] - - [992, 7126.67] + - [1054, 7126.67] - - [10, 1024, 1, 2000] - - [959, 807.671] + - [1021, 807.671] - - [256, 100, 1, 100] - - [971, 296.396] + - [1033, 296.396] - - [32, 2000, 1, 2048] - - [914, 2167.3] + - [976, 2167.3] - - [64, 1024, 1, 2048] - - [947, 2383.23] + - [1009, 2383.23] - - [200, 2048, 1, 512] - - [1022, 5264.04] + - [1084, 5264.04] - - [256, 500, 1, 10] - - [1004, 210.626] + - [1066, 210.626] - - [16, 1024, 1, 100] - - [884, 262.664] + - [946, 262.664] - - [32, 1024, 1, 1024] - - [889, 1476.97] + - [951, 1476.97] - - [512, 500, 1, 512] - - [978, 5851.53] + - [1040, 5851.53] - - [128, 1024, 1, 2000] - - [1062, 5516.6] + - [1124, 5516.6] - - [8, 100, 1, 500] - - [886, 46.3963] + - [948, 46.3963] - - [100, 2000, 1, 2048] - - [1041, 3715.63] + - [1103, 3715.63] - - [10, 512, 1, 512] - - [896, 292.671] + - [958, 292.671] - - [8, 500, 1, 10] - - [925, 8.87193] + - [987, 8.87193] - - [10, 2000, 1, 1024] - - [939, 640.1] + - [1001, 640.1] - - [16, 1024, 1, 10] - - [924, 36.6714] + - [986, 36.6714] - - [16, 512, 1, 2048] - - [903, 585.897] + - [965, 585.897] - - [256, 512, 1, 10] - - [969, 230.861] + - [1031, 230.861] - - [2, 2000, 1, 100] - - [931, 64.2026] + - [993, 64.2026] - - [128, 512, 1, 2048] - - [898, 3106.99] + - [960, 3106.99] - - [128, 512, 1, 100] - - [891, 952.658] + - [953, 952.658] - - [512, 2000, 1, 1024] - - [988, 8066.07] + - [1050, 8066.07] - - [64, 500, 1, 2048] - - [1057, 1857.7] + - [1119, 1857.7] - - [64, 2000, 1, 2048] - - [1039, 3442.12] + - [1101, 3442.12] - - [64, 2048, 1, 512] - - [1040, 3315.76] + - [1102, 3315.76] - - [10, 2000, 1, 512] - - [886, 785.376] + - [948, 785.376] - - [32, 2000, 1, 500] - - [889, 2500.1] + - [951, 2500.1] - - [64, 2000, 1, 10] - - [877, 231.984] + - [939, 231.984] - - [500, 100, 1, 10] - - [974, 88.1282] + - [1036, 88.1282] - - [128, 1024, 1, 500] - - [1025, 4096.1] + - [1087, 4096.1] - - [64, 100, 1, 2048] - - [886, 587.34] + - [948, 587.34] - - [64, 100, 1, 10] - - [1045, 12.0403] + - [1107, 12.0403] - - [16, 512, 1, 500] - - [896, 461.361] + - [958, 461.361] - - [32, 2000, 1, 1024] - - [883, 1713.91] + - [945, 1713.91] - - [200, 512, 1, 1024] - - [1062, 3244.46] + - [1124, 3244.46] - - [128, 2048, 1, 10] - - [878, 455.211] + - [940, 455.211] - - [200, 100, 1, 2000] - - [886, 1462.09] + - [948, 1462.09] - - [2, 100, 1, 512] - - [886, 12.5272] + - [948, 12.5272] - - [64, 2048, 1, 100] - - [1052, 1689.17] + - [1114, 1689.17] - - [32, 512, 1, 100] - - [885, 266.074] + - [947, 266.074] - - [16, 512, 1, 1024] - - [950, 569.978] + - [1012, 569.978] - - [4, 1024, 1, 512] - - [940, 208.151] + - [1002, 208.151] - - [64, 2000, 1, 100] - - [1052, 1649.58] + - [1114, 1649.58] - - [512, 2048, 1, 512] - - [988, 7849.09] + - [1050, 7849.09] - - [2, 500, 1, 500] - - [874, 53.5188] + - [936, 53.5188] - - [32, 100, 1, 100] - - [885, 57.2429] + - [947, 57.2429] - - [100, 500, 1, 2000] - - [889, 2784.06] + - [951, 2784.06] - - [200, 2000, 1, 100] - - [961, 2994.11] + - [1023, 2994.11] - - [10, 512, 1, 10] - - [921, 11.1345] + - [983, 11.1345] - - [100, 500, 1, 2048] - - [1061, 2361.72] + - [1123, 2361.72] - - [4, 2048, 1, 500] - - [896, 379.359] + - [958, 379.359] - - [200, 500, 1, 100] - - [1022, 1288.76] + - [1084, 1288.76] - - [500, 500, 1, 500] - - [978, 5425.45] + - [1040, 5425.45] - - [2, 100, 1, 1024] - - [950, 16.3025] + - [1012, 16.3025] - - [128, 2048, 1, 512] - - [1036, 4699.6] + - [1098, 4699.6] - - [200, 2000, 1, 1024] - - [986, 4621.04] + - [1048, 4621.04] - - [32, 512, 1, 1024] - - [949, 1028.12] + - [1011, 1028.12] - - [100, 2048, 1, 500] - - [1010, 4142.49] + - [1072, 4142.49] - - [256, 100, 1, 1024] - - [1040, 1443.62] + - [1102, 1443.62] - - [16, 2000, 1, 500] - - [935, 1428.67] + - [997, 1428.67] - - [128, 100, 1, 100] - - [885, 213.433] + - [947, 213.433] - - [500, 500, 1, 2048] - - [982, 6639.1] + - [1044, 6639.1] - - [32, 512, 1, 10] - - [918, 36.0298] + - [980, 36.0298] - - [128, 100, 1, 1024] - - [946, 791.598] + - [1008, 791.598] - - [16, 500, 1, 2000] - - [959, 694.544] + - [1021, 694.544] - - [4, 2048, 1, 100] - - [930, 129.72] + - [992, 129.72] - - [64, 500, 1, 500] - - [872, 1333.43] + - [934, 1333.43] - - [500, 1024, 1, 2048] - - [991, 7031.86] + - [1053, 7031.86] - - [512, 2048, 1, 100] - - [966, 5285.26] + - [1028, 5285.26] - - [128, 512, 1, 1024] - - [1058, 2519.2] + - [1120, 2519.2] - - [128, 512, 1, 2000] - - [1056, 3608.91] + - [1118, 3608.91] - - [128, 2000, 1, 2000] - - [1029, 7017.64] + - [1091, 7017.64] - - [2, 512, 1, 10] - - [922, 2.13175] + - [984, 2.13175] - - [10, 512, 1, 500] - - [886, 293.678] + - [948, 293.678] - - [4, 1024, 1, 2000] - - [906, 326.215] + - [968, 326.215] - - [256, 100, 1, 2000] - - [1043, 1768.06] + - [1105, 1768.06] - - [512, 2048, 1, 2000] - - [988, 8674.62] + - [1050, 8674.62] - - [100, 100, 1, 10] - - [1050, 21.6517] + - [1112, 21.6517] - - [256, 500, 1, 1024] - - [990, 4833.14] + - [1052, 4833.14] - - [128, 512, 1, 10] - - [878, 132.229] + - [940, 132.229] - - [256, 100, 1, 500] - - [1037, 914.386] + - [1099, 914.386] - - [64, 100, 1, 512] - - [944, 369.109] + - [1006, 369.109] - - [64, 512, 1, 500] - - [886, 1600.1] + - [948, 1600.1] - - [64, 2048, 1, 2000] - - [1040, 5925.6] + - [1102, 5925.6] - - [100, 2048, 1, 1024] - - [998, 3260.6] + - [1060, 3260.6] - - [200, 2000, 1, 10] - - [878, 595.338] + - [940, 595.338] - - [128, 1024, 1, 100] - - [1022, 1689.17] + - [1084, 1689.17] - - [16, 2000, 1, 100] - - [885, 493.927] + - [947, 493.927] - - [8, 100, 1, 512] - - [886, 49.8087] + - [948, 49.8087] - - [500, 2048, 1, 1024] - - [988, 7651.71] + - [1050, 7651.71] - - [500, 2000, 1, 10] - - [976, 1008.16] + - [1038, 1008.16] - - [32, 100, 1, 500] - - [950, 187.016] + - [1012, 187.016] - - [256, 1024, 1, 2048] - - [991, 6190.95] + - [1053, 6190.95] - - [32, 500, 1, 2048] - - [886, 1083.7] + - [948, 1083.7] - - [4, 2000, 1, 10] - - [929, 17.6439] + - [991, 17.6439] - - [128, 500, 1, 2000] - - [946, 3516.58] + - [1008, 3516.58] - - [8, 1024, 1, 10] - - [920, 18.0649] + - [982, 18.0649] - - [2, 500, 1, 100] - - [865, 16.1256] + - [927, 16.1256] - - [10, 500, 1, 512] - - [886, 291.009] + - [948, 291.009] - - [10, 2000, 1, 10] - - [864, 38.5615] + - [926, 38.5615] - - [500, 512, 1, 512] - - [981, 5893.63] + - [1043, 5893.63] - - [32, 500, 1, 500] - - [886, 892.957] + - [948, 892.957] - - [256, 500, 1, 2000] - - [995, 6237.92] + - [1057, 6237.92] - - [100, 500, 1, 100] - - [897, 726.844] + - [959, 726.844] - - [500, 2048, 1, 100] - - [970, 4867.02] + - [1032, 4867.02] - - [10, 1024, 1, 512] - - [886, 520.227] + - [948, 520.227] - - [2, 2048, 1, 512] - - [896, 151.628] + - [958, 151.628] - - [256, 512, 1, 100] - - [975, 1590.78] + - [1037, 1590.78] - - [10, 2048, 1, 100] - - [886, 324.151] + - [948, 324.151] - - [8, 2048, 1, 100] - - [941, 256.1] + - [1003, 256.1] - - [512, 100, 1, 512] - - [1037, 2100.61] + - [1099, 2100.61] - - [4, 500, 1, 500] - - [886, 115.841] + - [948, 115.841] - - [64, 100, 1, 1024] - - [886, 450.21] + - [948, 450.21] - - [2, 2048, 1, 1024] - - [943, 137.708] + - [1005, 137.708] - - [2, 500, 1, 2000] - - [912, 90.3527] + - [974, 90.3527] - - [512, 1024, 1, 500] - - [1017, 6898.63] + - [1079, 6898.63] - - [128, 2000, 1, 500] - - [1022, 5161.39] + - [1084, 5161.39] - - [32, 512, 1, 2048] - - [956, 1103.86] + - [1018, 1103.86] - - [10, 100, 1, 2000] - - [886, 106.032] + - [948, 106.032] - - [4, 100, 1, 512] - - [886, 24.7154] + - [948, 24.7154] - - [2, 512, 1, 2048] - - [950, 73.3246] + - [1012, 73.3246] - - [200, 512, 1, 2048] - - [1062, 3954.01] + - [1124, 3954.01] - - [200, 2000, 1, 2000] - - [1024, 6230.63] + - [1086, 6230.63] - - [100, 100, 1, 2000] - - [886, 827.915] + - [948, 827.915] - - [500, 2048, 1, 2000] - - [987, 8388.04] + - [1049, 8388.04] - - [64, 2048, 1, 2048] - - [1032, 3406.64] + - [1094, 3406.64] - - [16, 2000, 1, 1024] - - [892, 1024.1] + - [954, 1024.1] - - [512, 2048, 1, 1024] - - [965, 8061.22] + - [1027, 8061.22] - - [10, 500, 1, 500] - - [896, 284.191] + - [958, 284.191] - - [200, 1024, 1, 2048] - - [1060, 4886.29] + - [1122, 4886.29] - - [10, 2000, 1, 2000] - - [886, 1449.38] + - [948, 1449.38] - - [8, 2000, 1, 500] - - [935, 719.524] + - [997, 719.524] - - [2, 100, 1, 2048] - - [950, 19.945] + - [1012, 19.945] - - [32, 100, 1, 2048] - - [950, 323.894] + - [1012, 323.894] - - [512, 512, 1, 10] - - [1007, 420.203] + - [1069, 420.203] - - [512, 500, 1, 10] - - [1012, 376.571] + - [1074, 376.571] - - [16, 100, 1, 1024] - - [896, 129.72] + - [958, 129.72] - - [2, 500, 1, 10] - - [860, 2.21864] + - [922, 2.21864] - - [200, 512, 1, 10] - - [862, 188.335] + - [924, 188.335] - - [512, 1024, 1, 100] - - [962, 3877.97] + - [1024, 3877.97] - - [16, 2000, 1, 2000] - - [886, 2222.32] + - [948, 2222.32] - - [500, 500, 1, 1024] - - [982, 6130.37] + - [1044, 6130.37] - - [500, 100, 1, 2048] - - [1037, 2949.41] + - [1099, 2949.41] - - [256, 1024, 1, 512] - - [1001, 5886.84] + - [1063, 5886.84] - - [256, 500, 1, 512] - - [979, 4380.85] + - [1041, 4380.85] - - [16, 1024, 1, 2000] - - [950, 1208.36] + - [1012, 1208.36] - - [200, 500, 1, 2048] - - [1062, 3855.52] + - [1124, 3855.52] - - [256, 2000, 1, 10] - - [964, 727.373] + - [1026, 727.373] - - [10, 2048, 1, 2048] - - [917, 823.158] + - [979, 823.158] - - [512, 2000, 1, 100] - - [966, 5120.1] + - [1028, 5120.1] - - [10, 1024, 1, 1024] - - [893, 553.146] + - [955, 553.146] - - [512, 2000, 1, 2048] - - [994, 7563.4] + - [1056, 7563.4] - - [500, 1024, 1, 500] - - [1018, 6570.94] + - [1080, 6570.94] - - [500, 100, 1, 512] - - [1037, 2038.32] + - [1099, 2038.32] - - [256, 2000, 1, 100] - - [986, 3764.81] + - [1048, 3764.81] - - [512, 1024, 1, 2048] - - [1030, 7286.62] + - [1092, 7286.62] - - [32, 512, 1, 500] - - [886, 898.346] + - [948, 898.346] - - [100, 2000, 1, 10] - - [878, 333.433] + - [940, 333.433] - - [100, 500, 1, 512] - - [1056, 2176.97] + - [1118, 2176.97] - - [8, 2000, 1, 512] - - [935, 602.453] + - [997, 602.453] - - [100, 2048, 1, 2048] - - [1042, 3694.87] + - [1104, 3694.87] - - [128, 1024, 1, 2048] - - [1061, 4168.35] + - [1123, 4168.35] - - [8, 500, 1, 2000] - - [960, 352.213] + - [1022, 352.213] - - [100, 2000, 1, 500] - - [1010, 4045.41] + - [1072, 4045.41] - - [100, 2048, 1, 100] - - [1010, 2081.4] + - [1072, 2081.4] - - [4, 100, 1, 1024] - - [886, 33.1323] + - [948, 33.1323] - - [500, 2048, 1, 2048] - - [994, 7765.03] + - [1056, 7765.03] - - [2, 2000, 1, 2048] - - [905, 166.334] + - [967, 166.334] - - [200, 2048, 1, 10] - - [879, 609.624] + - [941, 609.624] - - [2, 500, 1, 1024] - - [950, 75.3941] + - [1012, 75.3941] - - [100, 500, 1, 1024] - - [946, 1975.41] + - [1008, 1975.41] - - [16, 2048, 1, 500] - - [886, 1473.48] + - [948, 1473.48] - - [100, 1024, 1, 10] - - [1046, 185.607] + - [1108, 185.607] - - [8, 2048, 1, 1024] - - [942, 543.404] + - [1004, 543.404] - - [2, 2000, 1, 500] - - [886, 179.956] + - [948, 179.956] - - [32, 100, 1, 1024] - - [886, 267.812] + - [948, 267.812] - - [500, 2000, 1, 512] - - [1016, 7087.59] + - [1078, 7087.59] - - [64, 100, 1, 2000] - - [896, 615.485] + - [958, 615.485] - - [100, 1024, 1, 2000] - - [1059, 4224.52] + - [1121, 4224.52] - - [64, 500, 1, 10] - - [861, 63.5921] + - [923, 63.5921] - - [32, 2048, 1, 100] - - [882, 941.709] + - [944, 941.709] - - [64, 500, 1, 512] - - [886, 1575.48] + - [948, 1575.48] - - [10, 100, 1, 1024] - - [896, 82.6806] + - [958, 82.6806] - - [16, 512, 1, 100] - - [885, 148.506] + - [947, 148.506] - - [4, 100, 1, 2000] - - [959, 43.9597] + - [1021, 43.9597] - - [2, 512, 1, 1024] - - [950, 74.152] + - [1012, 74.152] - - [64, 512, 1, 1024] - - [951, 1571.0] + - [1013, 1571.0] - - [10, 2048, 1, 500] - - [886, 920.963] + - [948, 920.963] - - [4, 2000, 1, 2048] - - [905, 326.215] + - [967, 326.215] - - [512, 100, 1, 2048] - - [1040, 3084.15] + - [1102, 3084.15] - - [32, 100, 1, 2000] - - [886, 343.448] + - [948, 343.448] - - [256, 512, 1, 500] - - [979, 4311.68] + - [1041, 4311.68] - - [100, 2000, 1, 100] - - [1010, 2016.23] + - [1072, 2016.23] - - [8, 2000, 1, 1024] - - [899, 544.781] + - [961, 544.781] - - [4, 512, 1, 500] - - [886, 118.619] + - [948, 118.619] - - [128, 1024, 1, 10] - - [1049, 244.637] + - [1111, 244.637] - - [4, 500, 1, 1024] - - [886, 144.733] + - [948, 144.733] - - [32, 2048, 1, 512] - - [889, 2140.05] + - [951, 2140.05] - - [32, 100, 1, 10] - - [864, 7.11754] + - [926, 7.11754] - - [100, 2048, 1, 10] - - [1053, 341.433] + - [1115, 341.433] - - [512, 500, 1, 100] - - [1014, 2461.64] + - [1076, 2461.64] - - [128, 2000, 1, 1024] - - [998, 4174.37] + - [1060, 4174.37] - - [200, 1024, 1, 500] - - [1010, 4295.4] + - [1072, 4295.4] - - [32, 2048, 1, 1024] - - [913, 1667.82] + - [975, 1667.82] - - [10, 1024, 1, 2048] - - [904, 555.49] + - [966, 555.49] - - [8, 500, 1, 100] - - [885, 71.5286] + - [947, 71.5286] - - [32, 2048, 1, 500] - - [889, 2528.5] + - [951, 2528.5] - - [200, 100, 1, 1024] - - [898, 1071.23] + - [960, 1071.23] - - [16, 100, 1, 100] - - [875, 28.6714] + - [937, 28.6714] - - [8, 1024, 1, 2000] - - [959, 654.413] + - [1021, 654.413] - - [4, 512, 1, 100] - - [885, 36.6714] + - [947, 36.6714] - - [16, 500, 1, 100] - - [885, 142.957] + - [947, 142.957] - - [8, 1024, 1, 2048] - - [911, 441.606] + - [973, 441.606] - - [16, 1024, 1, 2048] - - [912, 886.845] + - [974, 886.845] - - [10, 2048, 1, 1024] - - [890, 639.476] + - [952, 639.476] - - [64, 512, 1, 100] - - [885, 518.581] + - [947, 518.581] - - [2, 100, 1, 500] - - [886, 9.71538] + - [948, 9.71538] - - [2, 500, 1, 512] - - [892, 48.2203] + - [954, 48.2203] - - [256, 512, 1, 2000] - - [995, 6450.49] + - [1057, 6450.49] - - [128, 500, 1, 1024] - - [889, 2497.66] + - [951, 2497.66] - - [10, 100, 1, 10] - - [926, 2.33214] + - [988, 2.33214] - - [8, 2048, 1, 2048] - - [876, 643.398] + - [938, 643.398] - - [16, 2048, 1, 2048] - - [916, 1338.0] + - [978, 1338.0] - - [64, 1024, 1, 10] - - [879, 132.229] + - [941, 132.229] - - [500, 100, 1, 500] - - [1037, 1941.09] + - [1099, 1941.09] - - [256, 1024, 1, 2000] - - [1033, 7629.44] + - [1095, 7629.44] - - [200, 512, 1, 500] - - [1022, 3232.42] + - [1084, 3232.42] - - [8, 2000, 1, 10] - - [923, 32.3581] + - [985, 32.3581] - - [64, 2000, 1, 512] - - [1021, 3225.3] + - [1083, 3225.3] - - [2, 512, 1, 100] - - [865, 16.7234] + - [927, 16.7234] - - [4, 2000, 1, 2000] - - [886, 586.61] + - [948, 586.61] - - [200, 1024, 1, 100] - - [1010, 2133.43] + - [1072, 2133.43] - - [16, 100, 1, 500] - - [950, 92.6926] + - [1012, 92.6926] - - [128, 100, 1, 500] - - [946, 526.416] + - [1008, 526.416] - - [500, 1024, 1, 1024] - - [980, 7201.86] + - [1042, 7201.86] - - [200, 1024, 1, 1024] - - [1032, 4519.82] + - [1094, 4519.82] - - [8, 2048, 1, 512] - - [896, 624.252] + - [958, 624.252] - - [200, 2000, 1, 500] - - [986, 5186.82] + - [1048, 5186.82] - - [512, 100, 1, 1024] - - [1037, 2742.19] + - [1099, 2742.19] - - [16, 100, 1, 2000] - - [896, 168.876] + - [958, 168.876] - - [500, 512, 1, 2000] - - [1033, 7289.39] + - [1095, 7289.39] - - [8, 2000, 1, 2048] - - [907, 668.289] + - [969, 668.289] - - [256, 2048, 1, 100] - - [968, 3924.41] + - [1030, 3924.41] - - [32, 2048, 1, 2000] - - [900, 3882.56] + - [962, 3882.56] - - [200, 500, 1, 512] - - [1025, 3368.52] + - [1087, 3368.52] - - [10, 512, 1, 100] - - [885, 91.5286] + - [947, 91.5286] - - [16, 2000, 1, 10] - - [863, 61.6385] + - [925, 61.6385] - - [8, 512, 1, 100] - - [885, 72.2127] + - [947, 72.2127] - - [256, 512, 1, 512] - - [990, 4584.04] + - [1052, 4584.04] - - [500, 2000, 1, 1024] - - [965, 7569.59] + - [1027, 7569.59] - - [512, 512, 1, 500] - - [981, 5708.81] + - [1043, 5708.81] - - [256, 2048, 1, 1024] - - [1005, 5923.21] + - [1067, 5923.21] - - [8, 2048, 1, 2000] - - [886, 1153.9] + - [948, 1153.9] - - [100, 512, 1, 2048] - - [952, 2383.23] + - [1014, 2383.23] - - [100, 1024, 1, 512] - - [1037, 3343.77] + - [1099, 3343.77] - - [128, 100, 1, 2000] - - [1055, 1084.85] + - [1117, 1084.85] - - [4, 2048, 1, 2048] - - [904, 332.454] + - [966, 332.454] - - [2, 1024, 1, 2000] - - [915, 161.106] + - [977, 161.106] - - [100, 512, 1, 512] - - [889, 2184.63] + - [951, 2184.63] - - [128, 1024, 1, 1024] - - [1032, 3848.09] + - [1094, 3848.09] - - [200, 2048, 1, 1024] - - [967, 4547.26] + - [1029, 4547.26] - - [32, 1024, 1, 2000] - - [896, 2416.62] + - [958, 2416.62] - - [128, 500, 1, 100] - - [891, 919.64] + - [953, 919.64] - - [200, 512, 1, 2000] - - [1059, 4238.51] + - [1121, 4238.51] - - [10, 2048, 1, 2000] - - [896, 1454.65] + - [958, 1454.65] - - [256, 1024, 1, 500] - - [993, 5669.3] + - [1055, 5669.3] - - [100, 100, 1, 100] - - [885, 171.333] + - [947, 171.333] - - [8, 512, 1, 1024] - - [954, 286.596] + - [1016, 286.596] - - [200, 1024, 1, 512] - - [1010, 4354.65] + - [1072, 4354.65] - - [256, 500, 1, 500] - - [995, 4020.2] + - [1057, 4020.2] - - [200, 100, 1, 500] - - [1059, 702.347] + - [1121, 702.347] - - [2, 1024, 1, 2048] - - [905, 112.85] + - [967, 112.85] - - [256, 500, 1, 2048] - - [995, 5041.33] + - [1057, 5041.33] - - [512, 2048, 1, 500] - - [988, 7710.22] + - [1050, 7710.22] - - [512, 100, 1, 2000] - - [1037, 3099.37] + - [1099, 3099.37] - - [512, 500, 1, 1024] - - [996, 6463.22] + - [1058, 6463.22] - - [16, 512, 1, 2000] - - [912, 721.227] + - [974, 721.227] - - [64, 500, 1, 1024] - - [951, 1528.46] + - [1013, 1528.46] - - [512, 2000, 1, 10] - - [972, 1174.41] + - [1034, 1174.41] - - [256, 512, 1, 1024] - - [990, 4978.5] + - [1052, 4978.5] - - [10, 512, 1, 1024] - - [950, 370.36] + - [1012, 370.36] - - [512, 100, 1, 100] - - [973, 659.894] + - [1035, 659.894] - - [8, 2000, 1, 100] - - [885, 256.51] + - [947, 256.51] - - [128, 2048, 1, 1024] - - [998, 4173.54] + - [1060, 4173.54] - - [2, 2000, 1, 2000] - - [886, 250.727] + - [948, 250.727] - - [16, 2048, 1, 1024] - - [933, 1046.06] + - [995, 1046.06] - - [500, 512, 1, 500] - - [978, 5517.34] + - [1040, 5517.34] - - [8, 100, 1, 1024] - - [951, 64.1] + - [1013, 64.1] - - [10, 100, 1, 100] - - [875, 17.9571] + - [937, 17.9571] - - [200, 500, 1, 500] - - [1025, 3140.8] + - [1087, 3140.8] - - [10, 500, 1, 2000] - - [912, 444.94] + - [974, 444.94] - - [500, 100, 1, 2000] - - [1040, 2969.22] + - [1102, 2969.22] - - [100, 512, 1, 2000] - - [952, 2776.67] + - [1014, 2776.67] - - [500, 1024, 1, 2000] - - [1031, 8020.15] + - [1093, 8020.15] - - [32, 2000, 1, 2000] - - [892, 3827.85] + - [954, 3827.85] - - [64, 1024, 1, 512] - - [1056, 2573.29] + - [1118, 2573.29] - - [64, 2000, 1, 2000] - - [1025, 5797.2] + - [1087, 5797.2] - - [32, 500, 1, 100] - - [885, 266.767] + - [947, 266.767] - - [128, 2000, 1, 2048] - - [1041, 4548.05] + - [1103, 4548.05] - - [10, 100, 1, 2048] - - [950, 98.5615] + - [1012, 98.5615] - - [32, 2048, 1, 2048] - - [913, 2213.45] + - [975, 2213.45] - - [64, 100, 1, 100] - - [886, 96.4855] + - [948, 96.4855] - - [2, 1024, 1, 100] - - [936, 34.6946] + - [998, 34.6946] - - [256, 1024, 1, 10] - - [1006, 425.658] + - [1068, 425.658] - - [256, 1024, 1, 1024] - - [999, 5482.85] + - [1061, 5482.85] - - [64, 500, 1, 2000] - - [886, 2056.66] + - [948, 2056.66] - - [512, 2000, 1, 512] - - [984, 7550.33] + - [1046, 7550.33] - - [8, 512, 1, 512] - - [893, 232.086] + - [955, 232.086] - - [8, 512, 1, 2048] - - [886, 290.564] + - [948, 290.564] - - [100, 100, 1, 1024] - - [1056, 624.49] + - [1118, 624.49] - - [2, 2048, 1, 10] - - [929, 8.92759] + - [991, 8.92759] - - [4, 2048, 1, 512] - - [935, 312.176] + - [997, 312.176] - - [4, 2048, 1, 10] - - [928, 18.0649] + - [990, 18.0649] - - [8, 100, 1, 2000] - - [905, 85.9369] + - [967, 85.9369] - - [2, 1024, 1, 1024] - - [902, 101.314] + - [964, 101.314] - - [16, 2048, 1, 100] - - [886, 518.581] + - [948, 518.581] - - [16, 512, 1, 512] - - [896, 456.003] + - [958, 456.003] - - [32, 500, 1, 512] - - [893, 906.295] + - [955, 906.295] - - [500, 2000, 1, 2000] - - [988, 8143.42] + - [1050, 8143.42] - - [500, 1024, 1, 10] - - [969, 680.951] + - [1031, 680.951] - - [32, 500, 1, 1024] - - [945, 1008.97] + - [1007, 1008.97] - - [32, 500, 1, 10] - - [881, 33.4333] + - [943, 33.4333] - - [500, 500, 1, 10] - - [1010, 367.747] + - [1072, 367.747] - - [4, 2000, 1, 500] - - [896, 370.47] + - [958, 370.47] - - [10, 2000, 1, 500] - - [886, 899.381] + - [948, 899.381] - - [32, 2000, 1, 512] - - [898, 2089.9] + - [960, 2089.9] - - [256, 500, 1, 100] - - [1011, 1495.43] + - [1073, 1495.43] - - [256, 2048, 1, 10] - - [969, 789.69] + - [1031, 789.69] - - [4, 1024, 1, 500] - - [886, 222.709] + - [948, 222.709] - - [256, 512, 1, 2048] - - [995, 5292.6] + - [1057, 5292.6] - - [2, 2000, 1, 1024] - - [933, 137.365] + - [995, 137.365] - - [256, 100, 1, 512] - - [1037, 1085.13] + - [1099, 1085.13] - - [8, 1024, 1, 500] - - [886, 441.479] + - [948, 441.479] - - [256, 2048, 1, 500] - - [1016, 7031.86] + - [1078, 7031.86] - - [256, 2048, 1, 2048] - - [979, 6771.93] + - [1041, 6771.93] - - [2, 2000, 1, 512] - - [940, 159.106] + - [1002, 159.106] - - [256, 2000, 1, 512] - - [983, 6527.59] + - [1045, 6527.59] - - [4, 1024, 1, 100] - - [932, 70.237] + - [994, 70.237] - - [512, 1024, 1, 2000] - - [1017, 8295.8] + - [1079, 8295.8] - - [100, 500, 1, 500] - - [889, 2016.23] + - [951, 2016.23] - - [4, 2048, 1, 1024] - - [937, 285.039] + - [999, 285.039] - - [2, 1024, 1, 500] - - [886, 109.502] + - [948, 109.502] - - [64, 100, 1, 500] - - [886, 296.396] + - [948, 296.396] - - [256, 2000, 1, 2000] - - [994, 8152.97] + - [1056, 8152.97] - - [2, 512, 1, 500] - - [892, 44.8552] + - [954, 44.8552] - - [8, 2048, 1, 500] - - [886, 736.791] + - [948, 736.791] - - [10, 1024, 1, 500] - - [886, 547.109] + - [948, 547.109] - - [4, 2048, 1, 2000] - - [896, 604.23] + - [958, 604.23] - - [200, 1024, 1, 2000] - - [1063, 5400.94] + - [1125, 5400.94] - - [128, 500, 1, 512] - - [1056, 2730.77] + - [1118, 2730.77] - - [10, 500, 1, 2048] - - [950, 359.651] + - [1012, 359.651] - - [256, 2048, 1, 2000] - - [994, 8375.31] + - [1056, 8375.31] - - [8, 2000, 1, 2000] - - [896, 1146.23] + - [958, 1146.23] - - [100, 2048, 1, 512] - - [1019, 3936.2] + - [1081, 3936.2] - - [512, 500, 1, 2048] - - [995, 6756.39] + - [1057, 6756.39] - - [200, 2048, 1, 100] - - [986, 3180.22] + - [1048, 3180.22] - - [128, 512, 1, 512] - - [889, 2872.91] + - [951, 2872.91] - - [200, 2000, 1, 2048] - - [1035, 4818.92] + - [1097, 4818.92] - - [4, 2000, 1, 1024] - - [933, 275.369] + - [995, 275.369] - - [64, 512, 1, 10] - - [1048, 69.5237] + - [1110, 69.5237] - - [32, 500, 1, 2000] - - [915, 1246.21] + - [977, 1246.21] - - [128, 2048, 1, 2000] - - [1028, 7233.65] + - [1090, 7233.65] - - [100, 100, 1, 2048] - - [886, 790.223] + - [948, 790.223] - - [500, 2048, 1, 512] - - [1016, 7249.66] + - [1078, 7249.66] - - [200, 100, 1, 512] - - [892, 748.638] + - [954, 748.638] - - [32, 2000, 1, 100] - - [887, 930.333] + - [949, 930.333] - - [500, 512, 1, 2048] - - [1038, 6640.02] + - [1100, 6640.02] - - [500, 2000, 1, 500] - - [1018, 7078.24] + - [1080, 7078.24] - - [200, 100, 1, 2048] - - [896, 1387.63] + - [958, 1387.63] - - [2, 2048, 1, 100] - - [930, 64.9101] + - [992, 64.9101] - - [8, 100, 1, 10] - - [871, 1.85439] + - [933, 1.85439] - - [200, 2048, 1, 2048] - - [1035, 5022.02] + - [1097, 5022.02] - - [200, 2048, 1, 500] - - [986, 5355.75] + - [1048, 5355.75] - - [100, 100, 1, 500] - - [1056, 416.767] + - [1118, 416.767] - - [8, 2048, 1, 10] - - [927, 34.8119] + - [989, 34.8119] - - [100, 500, 1, 10] - - [867, 93.3836] + - [929, 93.3836] - - [200, 500, 1, 2000] - - [1059, 4152.92] + - [1121, 4152.92] - - [512, 2000, 1, 500] - - [988, 7485.48] + - [1050, 7485.48] - - [10, 500, 1, 1024] - - [954, 363.736] + - [1016, 363.736] - - [256, 100, 1, 10] - - [1003, 41.1256] + - [1065, 41.1256] - - [500, 512, 1, 1024] - - [982, 6362.82] + - [1044, 6362.82] - - [200, 2048, 1, 2000] - - [1024, 6321.09] + - [1086, 6321.09] - - [100, 1024, 1, 100] - - [1023, 1306.22] + - [1085, 1306.22] - - [500, 1024, 1, 100] - - [962, 3699.52] + - [1024, 3699.52] - - [10, 512, 1, 2048] - - [886, 361.18] + - [948, 361.18] - - [2, 1024, 1, 512] - - [935, 105.803] + - [997, 105.803] - - [4, 500, 1, 2048] - - [958, 143.517] + - [1020, 143.517] - - [100, 512, 1, 100] - - [891, 744.286] + - [953, 744.286] - - [16, 500, 1, 512] - - [886, 453.197] + - [948, 453.197] - - [10, 1024, 1, 100] - - [884, 166.334] + - [946, 166.334] - - [8, 1024, 1, 100] - - [932, 140.374] + - [994, 140.374] - - [64, 2000, 1, 500] - - [1027, 3940.99] + - [1089, 3940.99] - - [64, 1024, 1, 2000] - - [892, 3531.13] + - [954, 3531.13] - - [10, 100, 1, 512] - - [886, 61.6385] + - [948, 61.6385] - - [4, 500, 1, 2000] - - [912, 173.11] + - [974, 173.11] - - [512, 1024, 1, 10] - - [963, 736.46] + - [1025, 736.46] - - [128, 2048, 1, 2048] - - [1026, 4596.6] + - [1088, 4596.6] - - [4, 100, 1, 100] - - [875, 7.24286] + - [937, 7.24286] - - [32, 1024, 1, 512] - - [935, 1519.78] + - [997, 1519.78] - - [8, 512, 1, 2000] - - [960, 356.894] + - [1022, 356.894] - - [100, 100, 1, 512] - - [900, 426.767] + - [962, 426.767] - - [2, 2048, 1, 2048] - - [909, 170.878] + - [971, 170.878] - - [2, 512, 1, 2000] - - [912, 90.8801] + - [974, 90.8801] - - [16, 500, 1, 10] - - [885, 18.2818] + - [947, 18.2818] - - [10, 500, 1, 100] - - [885, 88.1282] + - [947, 88.1282] - - [4, 100, 1, 500] - - [950, 23.6849] + - [1012, 23.6849] - - [512, 1024, 1, 1024] - - [1002, 7431.87] + - [1064, 7431.87] - - [64, 500, 1, 100] - - [895, 506.429] + - [957, 506.429] - - [128, 2000, 1, 10] - - [1053, 432.532] + - [1115, 432.532] - - [10, 2000, 1, 2048] - - [916, 806.399] + - [978, 806.399] - - [2, 100, 1, 100] - - [873, 3.225] + - [935, 3.225] - - [10, 512, 1, 2000] - - [905, 462.194] + - [967, 462.194] - - [8, 500, 1, 500] - - [886, 231.581] + - [948, 231.581] - - [4, 500, 1, 512] - - [886, 118.619] + - [948, 118.619] - - [10, 500, 1, 10] - - [880, 11.0649] + - [942, 11.0649] - - [64, 512, 1, 2000] - - [886, 2116.9] + - [948, 2116.9] - - [500, 512, 1, 10] - - [1007, 395.162] + - [1069, 395.162] - - [200, 512, 1, 512] - - [1025, 3449.36] + - [1087, 3449.36] - - [512, 500, 1, 500] - - [981, 5536.43] + - [1043, 5536.43] - - [32, 512, 1, 2000] - - [896, 1264.3] + - [958, 1264.3] - - [128, 500, 1, 2048] - - [952, 3006.34] + - [1014, 3006.34] - - [500, 2048, 1, 10] - - [977, 1049.28] + - [1039, 1049.28] - - [512, 512, 1, 100] - - [1014, 2664.16] + - [1076, 2664.16] - - [200, 2000, 1, 512] - - [1022, 5192.8] + - [1084, 5192.8] - - [500, 500, 1, 512] - - [978, 5673.86] + - [1040, 5673.86] - - [128, 2048, 1, 500] - - [1010, 5251.38] + - [1072, 5251.38] - - [4, 512, 1, 512] - - [886, 123.753] + - [948, 123.753] - - [16, 2048, 1, 2000] - - [902, 2294.78] + - [964, 2294.78] - - [16, 500, 1, 1024] - - [886, 562.737] + - [948, 562.737] - - [256, 2000, 1, 500] - - [1016, 6639.1] + - [1078, 6639.1] - - [10, 1024, 1, 10] - - [866, 21.0836] + - [928, 21.0836] - - [16, 500, 1, 500] - - [886, 446.529] + - [948, 446.529] - - [10, 2048, 1, 512] - - [884, 784.962] + - [946, 784.962] - - [200, 500, 1, 10] - - [859, 176.156] + - [921, 176.156] - - [256, 2048, 1, 512] - - [1013, 6540.93] + - [1075, 6540.93] - - [256, 2000, 1, 2048] - - [990, 6670.43] + - [1052, 6670.43] - - [500, 2048, 1, 500] - - [1018, 7264.57] + - [1080, 7264.57] - - [500, 100, 1, 1024] - - [1040, 2700.52] + - [1102, 2700.52] - - [16, 100, 1, 512] - - [950, 96.7038] + - [1012, 96.7038] - - [64, 512, 1, 2048] - - [951, 1868.39] + - [1013, 1868.39] - - [32, 1024, 1, 10] - - [862, 69.5237] + - [924, 69.5237] - - [16, 2048, 1, 512] - - [935, 1226.5] + - [997, 1226.5] - - [8, 1024, 1, 512] - - [935, 416.202] + - [997, 416.202] - - [4, 1024, 1, 2048] - - [957, 223.201] + - [1019, 223.201] - - [100, 2048, 1, 2000] - - [1030, 5614.14] + - [1092, 5614.14] - - [512, 512, 1, 2048] - - [995, 6868.97] + - [1057, 6868.97] - - [256, 2000, 1, 1024] - - [986, 5758.98] + - [1048, 5758.98] - - [64, 512, 1, 512] - - [1055, 1651.4] + - [1117, 1651.4] - - [200, 1024, 1, 10] - - [869, 341.433] + - [931, 341.433] - - [128, 500, 1, 500] - - [898, 2580.75] + - [960, 2580.75] - - [100, 512, 1, 1024] - - [889, 2041.72] + - [951, 2041.72] - - [16, 1024, 1, 500] - - [886, 867.897] + - [948, 867.897] - - [128, 100, 1, 2048] - - [1056, 1011.46] + - [1118, 1011.46] - - [100, 512, 1, 500] - - [889, 2051.38] + - [951, 2051.38] - - [8, 1024, 1, 1024] - - [902, 424.625] + - [964, 424.625] - - [2, 2000, 1, 10] - - [928, 8.57458] + - [990, 8.57458] - - [4, 500, 1, 10] - - [925, 4.56429] + - [987, 4.56429] - - [500, 2000, 1, 2048] - - [1002, 7444.12] + - [1064, 7444.12] - - [4, 2000, 1, 100] - - [938, 128.305] + - [1000, 128.305] - - [512, 2000, 1, 2000] - - [988, 8454.53] + - [1050, 8454.53] - - [128, 500, 1, 10] - - [1047, 117.747] + - [1109, 117.747] - - [32, 1024, 1, 100] - - [895, 512.1] + - [957, 512.1] - - [8, 500, 1, 2048] - - [910, 286.935] + - [972, 286.935] - - [16, 1024, 1, 1024] - - [874, 881.256] + - [936, 881.256] - - [200, 100, 1, 10] - - [1046, 40.4226] + - [1108, 40.4226] - - [512, 100, 1, 500] - - [1040, 1987.68] + - [1102, 1987.68] - - [512, 2048, 1, 2048] - - [997, 8063.65] + - [1059, 8063.65] - - [16, 2000, 1, 512] - - [896, 1204.81] + - [958, 1204.81] - - [64, 2048, 1, 1024] - - [894, 2853.37] + - [956, 2853.37] - - [32, 2048, 1, 10] - - [868, 130.132] + - [930, 130.132] - - [10, 2048, 1, 10] - - [870, 39.4846] + - [932, 39.4846] - - [4, 2000, 1, 512] - - [886, 316.149] + - [948, 316.149] - - [4, 500, 1, 100] - - [885, 35.8143] + - [947, 35.8143] - - [8, 100, 1, 2048] - - [905, 84.7281] + - [967, 84.7281] - - [512, 2048, 1, 10] - - [985, 1225.07] + - [1047, 1225.07] - - [512, 100, 1, 10] - - [974, 90.2408] + - [1036, 90.2408] - - [4, 512, 1, 1024] - - [886, 143.348] + - [948, 143.348] - - [16, 2048, 1, 10] - - [919, 65.1159] + - [981, 65.1159] - - [500, 2000, 1, 100] - - [970, 4717.08] + - [1032, 4717.08] - - [32, 1024, 1, 2048] - - [913, 1582.86] + - [975, 1582.86] - - [100, 2000, 1, 2000] - - [1030, 5512.78] + - [1092, 5512.78] - - [128, 100, 1, 512] - - [1056, 561.196] + - [1118, 561.196] - - [500, 500, 1, 100] - - [1010, 2460.73] + - [1072, 2460.73] - - [32, 2000, 1, 10] - - [862, 119.503] + - [924, 119.503] - - [128, 2048, 1, 100] - - [1010, 2708.2] + - [1072, 2708.2] - - [10, 2000, 1, 100] - - [885, 316.556] + - [947, 316.556] - - [2, 2048, 1, 500] - - [896, 191.145] + - [958, 191.145] - - [32, 1024, 1, 500] - - [896, 1563.46] + - [958, 1563.46] - - [4, 1024, 1, 10] - - [925, 9.24286] + - [987, 9.24286] - - [100, 512, 1, 10] - - [1051, 97.0697] + - [1113, 97.0697] - - [8, 100, 1, 100] - - [901, 14.3857] + - [963, 14.3857] - - [128, 512, 1, 500] - - [889, 2677.22] + - [951, 2677.22] - - [16, 100, 1, 2048] - - [912, 161.997] + - [974, 161.997] - - [2, 1024, 1, 10] - - [925, 4.59123] + - [987, 4.59123] - - [4, 100, 1, 2048] - - [905, 41.8959] + - [967, 41.8959] - - [4, 512, 1, 2000] - - [905, 180.382] + - [967, 180.382] - - [4096, 64, 1, 2048] - - [1105, 7247.28] + - [1167, 7247.28] - - [1024, 10080, 1, 1024] - - [1093, 9833.47] + - [1155, 9833.47] - - [1024, 1131, 1, 1024] - - [1071, 7551.95] + - [1133, 7551.95] - - [36548, 1216, 1, 1024] - - [1083, 10351.6] + - [1145, 10351.6] - - [1024, 29, 1, 1024] - - [1115, 1697.01] + - [1177, 1697.01] - - [1024, 2592, 1, 1024] - - [1084, 8424.11] + - [1146, 8424.11] - - [1024, 1568, 1, 1024] - - [1095, 7511.86] + - [1157, 7511.86] - - [4096, 91, 1, 2048] - - [1064, 5599.91] + - [1126, 5599.91] - - [1024, 4445, 1, 1024] - - [1082, 9261.22] + - [1144, 9261.22] - - [1024, 6272, 1, 1024] - - [1077, 9439.61] + - [1139, 9439.61] - - [36548, 3584, 1, 1024] - - [1076, 10393.8] + - [1138, 10393.8] - - [1024, 1827, 1, 1024] - - [1095, 8714.42] + - [1157, 8714.42] - - [1024, 3220, 1, 1024] - - [1075, 8861.2] + - [1137, 8861.2] - - [1024, 1856, 1, 1024] - - [1092, 8827.05] + - [1154, 8827.05] - - [1024, 1760, 1, 1024] - - [1092, 8334.2] + - [1154, 8334.2] - - [1024, 1600, 1, 1024] - - [1092, 7615.07] + - [1154, 7615.07] - - [1024, 1, 1, 21] - - [1096, 0.1] + - [1158, 0.1] - - [36548, 4235, 1, 1024] - - [1076, 10276.8] + - [1138, 10276.8] - - [1024, 49, 1, 1024] - - [1111, 2643.12] + - [1173, 2643.12] - - [1024, 1984, 1, 1024] - - [1095, 9449.52] + - [1157, 9449.52] - - [1024, 14720, 1, 1024] - - [1082, 10033.3] + - [1144, 10033.3] - - [1024, 1152, 1, 1024] - - [1065, 7523.54] + - [1127, 7523.54] - - [36548, 14976, 1, 1024] - - [1083, 10421.7] + - [1145, 10421.7] - - [36548, 1152, 1, 1024] - - [1083, 10258.1] + - [1145, 10258.1] - - [4096, 86, 1, 3072] - - [1064, 5308.85] + - [1126, 5308.85] - - [1024, 3392, 1, 1024] - - [1077, 9176.54] + - [1139, 9176.54] - - [1024, 1408, 1, 1024] - - [1077, 8958.83] + - [1139, 8958.83] - - [1024, 2080, 1, 1024] - - [1068, 8396.49] + - [1130, 8396.49] - - [1024, 1824, 1, 1024] - - [1086, 8671.71] + - [1148, 8671.71] - - [36548, 2432, 1, 1024] - - [1076, 10392.6] + - [1138, 10392.6] - - [4096, 29, 1, 2048] - - [1097, 4325.66] + - [1159, 4325.66] - - [1024, 1102, 1, 1024] - - [1071, 7204.18] + - [1133, 7204.18] - - [4096, 49, 1, 2048] - - [1103, 5609.29] + - [1165, 5609.29] - - [36548, 1827, 1, 1024] - - [1083, 10183.2] + - [1145, 10183.2] - - [4096, 25, 1, 2048] - - [1098, 3788.31] + - [1160, 3788.31] - - [1024, 10176, 1, 1024] - - [1093, 9941.18] + - [1155, 9941.18] - - [1024, 774, 1, 1024] - - [1078, 7079.67] + - [1140, 7079.67] - - [1024, 1952, 1, 1024] - - [1095, 9300.49] + - [1157, 9300.49] - - [4096, 128, 1, 2048] - - [1065, 8274.96] + - [1127, 8274.96] - - [1024, 17024, 1, 1024] - - [1075, 9960.72] + - [1137, 9960.72] - - [1024, 1472, 1, 1024] - - [1084, 9343.37] + - [1146, 9343.37] - - [36548, 4459, 1, 1024] - - [1076, 10358.1] + - [1138, 10358.1] - - [4096, 91, 1, 3072] - - [1070, 5509.39] + - [1132, 5509.39] - - [1024, 3712, 1, 1024] - - [1084, 9048.66] + - [1146, 9048.66] - - [4096, 64, 1, 3072] - - [1117, 7489.93] + - [1179, 7489.93] - - [4096, 29, 1, 3072] - - [1097, 4511.78] + - [1159, 4511.78] - - [4096, 128, 1, 3072] - - [1064, 8423.83] + - [1126, 8423.83] - - [36548, 12928, 1, 1024] - - [1083, 10426.1] + - [1145, 10426.1] - - [1024, 1632, 1, 1024] - - [1065, 7761.73] + - [1127, 7761.73] - - [1024, 1696, 1, 1024] - - [1090, 8107.29] + - [1152, 8107.29] - - [4096, 24, 1, 2048] - - [1097, 3663.25] + - [1159, 3663.25] - - [4096, 63, 1, 3072] - - [1106, 7175.37] + - [1168, 7175.37] - - [4096, 96, 1, 2048] - - [1065, 5866.28] + - [1127, 5866.28] - - [36548, 1764, 1, 1024] - - [1076, 10128.5] + - [1138, 10128.5] - - [4096, 32, 1, 2048] - - [1101, 4540.62] + - [1163, 4540.62] - - [1024, 35, 1, 1024] - - [1109, 1911.57] + - [1171, 1911.57] - - [1024, 1120, 1, 1024] - - [1064, 7289.13] + - [1126, 7289.13] - - [4096, 49, 1, 3072] - - [1103, 5751.62] + - [1165, 5751.62] - - [1024, 24, 1, 1024] - - [1109, 1392.02] + - [1171, 1392.02] - - [1024, 2944, 1, 1024] - - [1085, 9284.93] + - [1147, 9284.93] - - [36548, 14080, 1, 1024] - - [1076, 10441.4] + - [1138, 10441.4] - - [1024, 1, 1, 1024] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 1280, 1, 1024] - - [1064, 8244.46] + - [1126, 8244.46] - - [1024, 13440, 1, 1024] - - [1076, 9799.92] + - [1138, 9799.92] - - [1024, 1015, 1, 1024] - - [1084, 9187.85] + - [1146, 9187.85] - - [36548, 9120, 1, 1024] - - [1076, 10400.0] + - [1138, 10400.0] - - [36548, 1, 1, 1024] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 3008, 1, 1024] - - [1085, 9468.55] + - [1147, 9468.55] - - [1024, 2560, 1, 1024] - - [1082, 8879.31] + - [1144, 8879.31] - - [1024, 21, 1, 1024] - - [1108, 1234.41] + - [1170, 1234.41] - - [1024, 2208, 1, 1024] - - [1064, 8231.27] + - [1126, 8231.27] - - [1024, 96, 1, 1024] - - [1114, 3767.44] + - [1176, 3767.44] - - [4096, 86, 1, 2048] - - [1065, 5529.09] + - [1127, 5529.09] - - [4096, 96, 1, 3072] - - [1064, 6273.28] + - [1126, 6273.28] - - [1024, 1920, 1, 1024] - - [1094, 9118.19] + - [1156, 9118.19] - - [4096, 27, 1, 2048] - - [1097, 4073.7] + - [1159, 4073.7] - - [36548, 2496, 1, 1024] - - [1076, 10361.2] + - [1138, 10361.2] - - [1024, 1, 1, 14] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 91, 1, 1024] - - [1116, 3647.67] + - [1178, 3647.67] - - [1024, 2016, 1, 1024] - - [1092, 9560.24] + - [1154, 9560.24] - - [1024, 1184, 1, 1024] - - [1065, 7678.96] + - [1127, 7678.96] - - [4096, 1, 1, 2048] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 1664, 1, 1024] - - [1090, 7934.07] + - [1152, 7934.07] - - [1024, 11424, 1, 1024] - - [1082, 9777.91] + - [1144, 9777.91] - - [4096, 24, 1, 3072] - - [1100, 3813.1] + - [1162, 3813.1] - - [1024, 1216, 1, 1024] - - [1064, 7902.13] + - [1126, 7902.13] - - [36548, 3185, 1, 1024] - - [1076, 10336.7] + - [1138, 10336.7] - - [36548, 9216, 1, 1024] - - [1076, 10414.3] + - [1138, 10414.3] - - [1024, 3200, 1, 1024] - - [1082, 8847.01] + - [1144, 8847.01] - - [1024, 2656, 1, 1024] - - [1077, 8649.25] + - [1139, 8649.25] - - [1024, 2368, 1, 1024] - - [1077, 8873.16] + - [1139, 8873.16] - - [1024, 4459, 1, 1024] - - [1084, 9431.32] + - [1146, 9431.32] - - [1024, 3808, 1, 1024] - - [1084, 9263.72] + - [1146, 9263.72] - - [1024, 2336, 1, 1024] - - [1077, 8966.0] + - [1139, 8966.0] - - [4096, 27, 1, 3072] - - [1097, 4171.74] + - [1159, 4171.74] - - [1024, 2304, 1, 1024] - - [1074, 8601.38] + - [1136, 8601.38] - - [1024, 1560, 1, 1024] - - [1089, 7481.74] + - [1151, 7481.74] - - [4096, 35, 1, 3072] - - [1103, 4176.9] + - [1165, 4176.9] - - [1024, 2496, 1, 1024] - - [1080, 9092.86] + - [1142, 9092.86] - - [1024, 1504, 1, 1024] - - [1080, 9220.53] + - [1142, 9220.53] - - [4096, 50, 1, 2048] - - [1104, 5472.83] + - [1166, 5472.83] - - [1024, 3232, 1, 1024] - - [1077, 8961.94] + - [1139, 8961.94] - - [1024, 14, 1, 1024] - - [1108, 882.315] + - [1170, 882.315] - - [36548, 1015, 1, 1024] - - [1076, 10140.9] + - [1138, 10140.9] - - [1024, 2000, 1, 1024] - - [1088, 9487.8] + - [1150, 9487.8] - - [36548, 243, 1, 1024] - - [1081, 9441.12] + - [1143, 9441.12] - - [36548, 32, 1, 1024] - - [1069, 4721.05] + - [1131, 4721.05] - - [1024, 25, 1, 1024] - - [1115, 1462.96] + - [1177, 1462.96] - - [1024, 13184, 1, 1024] - - [1079, 9866.28] + - [1141, 9866.28] - - [1024, 2688, 1, 1024] - - [1074, 8559.93] + - [1136, 8559.93] - - [1024, 27, 1, 1024] - - [1113, 1559.11] + - [1175, 1559.11] - - [36548, 950, 1, 1024] - - [1083, 10053.6] + - [1145, 10053.6] - - [1024, 1764, 1, 1024] - - [1090, 8347.11] + - [1152, 8347.11] - - [1024, 992, 1, 1024] - - [1077, 9035.82] + - [1139, 9035.82] - - [1024, 1376, 1, 1024] - - [1077, 8797.96] + - [1139, 8797.96] - - [1024, 950, 1, 1024] - - [1084, 8635.26] + - [1146, 8635.26] - - [36548, 774, 1, 1024] - - [1076, 9460.82] + - [1138, 9460.82] - - [36548, 25, 1, 1024] - - [1069, 3694.16] + - [1131, 3694.16] - - [1024, 4256, 1, 1024] - - [1077, 9172.16] + - [1139, 9172.16] - - [4096, 32, 1, 3072] - - [1098, 4886.67] + - [1160, 4886.67] - - [1024, 243, 1, 1024] - - [1102, 6594.41] + - [1164, 6594.41] - - [36548, 3712, 1, 1024] - - [1076, 10401.6] + - [1138, 10401.6] - - [1024, 50, 1, 1024] - - [1111, 2742.19] + - [1173, 2742.19] - - [1024, 3360, 1, 1024] - - [1073, 9017.37] + - [1135, 9017.37] - - [1024, 2048, 1, 1024] - - [1088, 9736.65] + - [1150, 9736.65] - - [1024, 2784, 1, 1024] - - [1084, 8835.6] + - [1146, 8835.6] - - [1024, 4992, 1, 1024] - - [1082, 9639.38] + - [1144, 9639.38] - - [36548, 1102, 1, 1024] - - [1083, 9859.04] + - [1145, 9859.04] - - [1024, 1536, 1, 1024] - - [1075, 9294.98] + - [1137, 9294.98] - - [1024, 2720, 1, 1024] - - [1080, 8617.88] + - [1142, 8617.88] - - [4096, 1, 1, 3072] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 2752, 1, 1024] - - [1084, 8902.17] + - [1146, 8902.17] - - [1024, 2816, 1, 1024] - - [1082, 8906.95] + - [1144, 8906.95] - - [1024, 2624, 1, 1024] - - [1084, 8494.41] + - [1146, 8494.41] - - [1024, 2144, 1, 1024] - - [1067, 8243.56] + - [1129, 8243.56] - - [36548, 1131, 1, 1024] - - [1083, 10104.6] + - [1145, 10104.6] - - [4096, 25, 1, 3072] - - [1098, 3959.98] + - [1160, 3959.98] - - [1024, 64, 1, 1024] - - [1111, 3410.1] + - [1173, 3410.1] - - [1024, 3296, 1, 1024] - - [1082, 9066.52] + - [1144, 9066.52] - - [36548, 4992, 1, 1024] - - [1076, 10395.6] + - [1138, 10395.6] - - [1024, 1344, 1, 1024] - - [1077, 8522.66] + - [1139, 8522.66] - - [36548, 2401, 1, 1024] - - [1076, 10250.3] + - [1138, 10250.3] - - [1024, 15744, 1, 1024] - - [1076, 10006.4] + - [1138, 10006.4] - - [1024, 15232, 1, 1024] - - [1075, 9912.21] + - [1137, 9912.21] - - [1024, 1888, 1, 1024] - - [1087, 8962.98] + - [1149, 8962.98] - - [1024, 1792, 1, 1024] - - [1091, 8556.82] + - [1153, 8556.82] - - [36548, 1073, 1, 1024] - - [1076, 10161.2] + - [1138, 10161.2] - - [4096, 50, 1, 3072] - - [1103, 5882.16] + - [1165, 5882.16] - - [36548, 15488, 1, 1024] - - [1083, 10437.1] + - [1145, 10437.1] - - [1024, 2464, 1, 1024] - - [1080, 8880.02] + - [1142, 8880.02] - - [1024, 2272, 1, 1024] - - [1077, 8720.35] + - [1139, 8720.35] - - [1024, 13, 1, 1024] - - [1107, 774.616] + - [1169, 774.616] - - [1024, 2432, 1, 1024] - - [1082, 8491.53] + - [1144, 8491.53] - - [36548, 24, 1, 1024] - - [1069, 3564.41] + - [1131, 3564.41] - - [1024, 3936, 1, 1024] - - [1092, 9433.3] + - [1154, 9433.3] - - [36548, 13824, 1, 1024] - - [1076, 10439.8] + - [1138, 10439.8] - - [1024, 2401, 1, 1024] - - [1084, 8870.03] + - [1146, 8870.03] - - [1024, 32, 1, 1024] - - [1099, 1839.71] + - [1161, 1839.71] - - [1024, 2176, 1, 1024] - - [1068, 8544.55] + - [1130, 8544.55] - - [1024, 2240, 1, 1024] - - [1077, 8381.55] + - [1139, 8381.55] - - [1024, 1728, 1, 1024] - - [1065, 8212.33] + - [1127, 8212.33] - - [1024, 128, 1, 1024] - - [1112, 4660.44] + - [1174, 4660.44] - - [1024, 216, 1, 1024] - - [1102, 5777.97] + - [1164, 5777.97] - - [1024, 63, 1, 1024] - - [1110, 3329.75] + - [1172, 3329.75] - - [1024, 86, 1, 1024] - - [1116, 3533.7] + - [1178, 3533.7] - - [1024, 2528, 1, 1024] - - [1072, 8789.25] + - [1134, 8789.25] - - [1024, 2400, 1, 1024] - - [1077, 8939.4] + - [1139, 8939.4] - - [1024, 1440, 1, 1024] - - [1084, 9131.41] + - [1146, 9131.41] - - [1024, 2912, 1, 1024] - - [1077, 9140.03] + - [1139, 9140.03] - - [4096, 35, 1, 2048] - - [1103, 4059.85] + - [1165, 4059.85] - - [4096, 63, 1, 2048] - - [1105, 6946.5] + - [1167, 6946.5] - - [1024, 2880, 1, 1024] - - [1075, 9104.98] + - [1137, 9104.98] - - [1024, 4064, 1, 1024] - - [1094, 9715.2] + - [1156, 9715.2] - - [1024, 4655, 1, 1024] - - [1082, 9033.9] + - [1144, 9033.9] - - [1024, 1088, 1, 1024] - - [1066, 8144.41] + - [1128, 8144.41] - - [36548, 6272, 1, 1024] - - [1083, 10427.4] + - [1145, 10427.4] - - [1024, 1, 1, 13] - - [1096, 0.1] + - [1158, 0.1] + - - [768, 512, 1, 768] + - [1182, 5889.14] + - - [768, 2048, 1, 3072] + - [1192, 9394.72] + - - [768, 32, 1, 768] + - [1204, 1502.84] + - - [64, 128, 96, 128] + - [1199, 4973.58] + - - [3072, 1024, 1, 768] + - [1193, 9856.17] + - - [768, 1024, 1, 3072] + - [1186, 8611.16] + - - [768, 512, 1, 3072] + - [1185, 6430.89] + - - [768, 64, 1, 768] + - [1206, 2621.54] + - - [768, 4096, 1, 3072] + - [1191, 10030.5] + - - [768, 2048, 1, 2] + - [1184, 381.863] + - - [768, 2048, 1, 768] + - [1189, 9754.3] + - - [768, 320, 1, 30522] + - [1202, 8529.5] + - - [64, 64, 96, 64] + - [1196, 2496.71] + - - [768, 640, 1, 30522] + - [1183, 8253.94] + - - [768, 1280, 1, 30522] + - [1188, 9572.95] + - - [768, 1280, 1, 768] + - [1192, 8714.03] + - - [768, 640, 1, 768] + - [1182, 7293.13] + - - [768, 32, 1, 2] + - [1194, 11.9154] + - - [3072, 2048, 1, 768] + - [1189, 10019.7] + - - [768, 4096, 1, 768] + - [1189, 9927.45] + - - [3072, 4096, 1, 768] + - [1192, 10150.2] + - - [64, 256, 192, 256] + - [1198, 7054.29] + - - [768, 8, 1, 768] + - [1205, 341.039] + - - [64, 128, 384, 128] + - [1197, 6765.11] + - - [768, 1024, 1, 768] + - [1187, 8768.68] + - - [768, 320, 1, 768] + - [1203, 6838.64] + - - [64, 64, 768, 64] + - [1200, 5388.93] + - - [768, 1024, 1, 2] + - [1180, 258.795] + - - [768, 16, 1, 768] + - [1205, 819.3] + - - [64, 256, 96, 256] + - [1198, 5893.74] + - - [3072, 512, 1, 768] + - [1190, 9722.89] + - - [768, 160, 1, 768] + - [1207, 5019.88] + - - [768, 4096, 1, 2] + - [1181, 507.475] + - - [1600, 512, 1, 1024] + - [1211, 7187.05] + - - [1024, 512, 1, 64] + - [1209, 2557.6] + - - [1024, 512, 1, 1] + - [1208, 71.3348] + - - [2048, 512, 1, 1] + - [1210, 90.4945] + - - [1024, 200, 1, 1] + - [1216, 40.1] + - - [32, 200, 1, 1] + - [1212, 1.66863] + - - [560, 200, 1, 1024] + - [1220, 4731.45] + - - [1, 512, 1, 1] + - [1219, 0.230612] + - - [64, 512, 1, 1] + - [1214, 7.68519] + - - [1024, 8192, 1, 256] + - [1229, 9519.09] + - - [1024, 22016, 1, 256] + - [1235, 9881.22] + - - [256, 8976, 1, 4352] + - [1227, 9567.18] + - - [512, 256, 1, 2048] + - [1240, 5917.99] + - - [1024, 19968, 1, 256] + - [1235, 9882.47] + - - [256, 8976, 1, 1536] + - [1225, 8437.45] + - - [256, 8976, 1, 33536] + - [1225, 8441.99] + - - [1024, 1792, 1, 256] + - [1225, 7757.07] + - - [1024, 21504, 1, 256] + - [1235, 9894.0] + - - [512, 215, 1, 2048] + - [1241, 4665.74] + - - [1024, 7168, 1, 256] + - [1229, 9509.45] + - - [256, 8976, 1, 15872] + - [1231, 8914.75] + - - [1024, 19712, 1, 256] + - [1235, 9772.0] + - - [256, 8976, 1, 5632] + - [1231, 8740.13] + - - [1024, 14848, 1, 256] + - [1235, 9756.25] + - - [1024, 28672, 1, 256] + - [1235, 9959.02] + - - [256, 8976, 1, 9728] + - [1238, 8853.14] + - - [1024, 17152, 1, 256] + - [1229, 9737.4] + - - [256, 8976, 1, 11520] + - [1231, 8999.3] + - - [256, 8976, 1, 8192] + - [1221, 7897.42] + - - [1024, 3328, 1, 256] + - [1236, 8593.63] + - - [256, 8976, 1, 7424] + - [1231, 8980.57] + - - [1024, 18944, 1, 256] + - [1235, 9854.95] + - - [1024, 10496, 1, 256] + - [1230, 9454.0] + - - [256, 8976, 1, 5376] + - [1228, 9608.47] + - - [256, 8976, 1, 6144] + - [1225, 7880.23] + - - [1024, 40448, 1, 256] + - [1235, 10016.7] + - - [256, 8976, 1, 22016] + - [1238, 8939.97] + - - [256, 8976, 1, 4864] + - [1226, 9211.53] + - - [256, 8976, 1, 12288] + - [1222, 8065.15] + - - [1024, 9728, 1, 256] + - [1235, 9636.35] + - - [256, 8976, 1, 2048] + - [1223, 7001.43] + - - [1024, 10240, 1, 256] + - [1229, 9620.06] + - - [256, 8976, 1, 2304] + - [1227, 9509.84] + - - [1024, 7936, 1, 256] + - [1235, 9300.77] + - - [768, 256, 1, 2048] + - [1239, 6268.05] + - - [1024, 9984, 1, 256] + - [1235, 9477.38] + - - [1024, 13312, 1, 256] + - [1235, 9758.66] + - - [1024, 16128, 1, 256] + - [1229, 9722.0] + - - [1024, 8960, 1, 256] + - [1230, 9398.35] + - - [1024, 5120, 1, 256] + - [1236, 9315.6] + - - [1024, 11264, 1, 256] + - [1229, 9664.9] + - - [256, 8976, 1, 20480] + - [1237, 8279.97] + - - [1024, 20992, 1, 256] + - [1229, 9878.97] + - - [256, 8976, 1, 9472] + - [1231, 8991.06] + - - [256, 8976, 1, 8448] + - [1231, 8983.62] + - - [256, 8976, 1, 20992] + - [1232, 8942.21] + - - [256, 8976, 1, 10496] + - [1232, 8989.81] + - - [1024, 15104, 1, 256] + - [1230, 9676.11] + - - [1024, 6400, 1, 256] + - [1238, 9145.99] + - - [1024, 4096, 1, 256] + - [1231, 9124.35] + - - [256, 8976, 1, 2560] + - [1225, 8566.21] + - - [256, 8976, 1, 2816] + - [1227, 9496.94] + - - [1024, 7680, 1, 256] + - [1235, 9460.94] + - - [256, 8976, 1, 14336] + - [1232, 8226.9] + - - [256, 8976, 1, 6656] + - [1232, 8771.52] + - - [1024, 3072, 1, 256] + - [1232, 9077.04] + - - [256, 8976, 1, 5888] + - [1228, 9546.4] + - - [1024, 12288, 1, 256] + - [1229, 9690.91] + - - [256, 8976, 1, 26112] + - [1234, 8699.93] + - - [1024, 7424, 1, 256] + - [1236, 9256.94] + - - [256, 8976, 1, 14848] + - [1237, 8885.89] + - - [768, 215, 1, 2048] + - [1239, 5628.69] + - - [1024, 2560, 1, 256] + - [1232, 8820.93] + - - [256, 8976, 1, 19968] + - [1231, 8928.96] + - - [256, 8976, 1, 9984] + - [1231, 8993.22] + - - [1024, 4864, 1, 256] + - [1232, 8974.4] + - - [1024, 33536, 1, 256] + - [1235, 9943.17] + - - [256, 8976, 1, 15104] + - [1232, 8996.73] + - - [1024, 2048, 1, 256] + - [1230, 8462.76] + - - [256, 8976, 1, 8960] + - [1232, 8999.02] + - - [1024, 6144, 1, 256] + - [1237, 9359.77] + - - [1024, 14592, 1, 256] + - [1235, 9667.52] + - - [256, 8976, 1, 19712] + - [1231, 9020.21] + - - [1024, 11520, 1, 256] + - [1230, 9527.8] + - - [1024, 5632, 1, 256] + - [1229, 9297.3] + - - [256, 8976, 1, 11008] + - [1238, 8994.9] + - - [256, 8976, 1, 17152] + - [1232, 9003.9] + - - [256, 8976, 1, 3072] + - [1221, 8262.06] + - - [1024, 3840, 1, 256] + - [1238, 8671.99] + - - [1024, 14336, 1, 256] + - [1235, 9760.38] + - - [1024, 20480, 1, 256] + - [1229, 9887.95] + - - [1024, 23552, 1, 256] + - [1229, 9890.56] + - - [256, 8976, 1, 7168] + - [1224, 8478.44] + - - [1024, 13568, 1, 256] + - [1229, 9654.74] + - - [1024, 4608, 1, 256] + - [1237, 9218.35] + - - [256, 8976, 1, 10240] + - [1222, 8076.26] + - - [1024, 8704, 1, 256] + - [1231, 9475.6] + - - [1024, 11008, 1, 256] + - [1235, 9525.06] + - - [1024, 8448, 1, 256] + - [1229, 9352.26] + - - [256, 8976, 1, 44505] + - [1233, 8430.33] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml index 1eac3ee28..eb99e9a3c 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml @@ -39633,8 +39633,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -39797,8 +39797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -39961,8 +39961,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40125,8 +40125,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40289,8 +40289,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40453,8 +40453,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40617,8 +40617,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40781,8 +40781,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40945,8 +40945,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41109,8 +41109,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41273,8 +41273,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41437,8 +41437,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41601,8 +41601,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41765,8 +41765,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41925,8 +41925,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42089,8 +42089,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42253,8 +42253,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42417,8 +42417,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42581,8 +42581,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42745,8 +42745,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42909,8 +42909,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43073,8 +43073,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43237,8 +43237,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43401,8 +43401,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43566,8 +43566,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43733,8 +43733,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43898,8 +43898,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44061,8 +44061,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44226,8 +44226,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44393,8 +44393,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44558,8 +44558,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44721,8 +44721,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44886,8 +44886,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45053,8 +45053,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45218,8 +45218,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45381,8 +45381,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45546,8 +45546,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45713,8 +45713,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45878,8 +45878,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46041,8 +46041,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46206,8 +46206,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46371,8 +46371,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46538,8 +46538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46703,8 +46703,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46868,8 +46868,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47033,8 +47033,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47198,8 +47198,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47361,8 +47361,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47526,8 +47526,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47693,8 +47693,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47858,8 +47858,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48021,8 +48021,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48186,8 +48186,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48353,8 +48353,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48518,8 +48518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48681,8 +48681,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48848,8 +48848,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49011,8 +49011,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49178,8 +49178,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49341,8 +49341,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49502,8 +49502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49665,8 +49665,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49826,8 +49826,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49987,8 +49987,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50146,8 +50146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50309,8 +50309,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50468,8 +50468,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50631,8 +50631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50790,8 +50790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50953,8 +50953,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51112,8 +51112,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51275,8 +51275,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51434,8 +51434,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51597,8 +51597,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51758,8 +51758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51917,8 +51917,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52080,8 +52080,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52239,8 +52239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52400,8 +52400,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52561,8 +52561,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52728,8 +52728,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52897,8 +52897,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53064,8 +53064,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53229,8 +53229,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53396,8 +53396,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53445,24 +53445,24 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -53470,32 +53470,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53504,9 +53509,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53514,26 +53519,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53543,6 +53556,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53552,6 +53566,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53566,39 +53581,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 341 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53606,56 +53629,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53663,19 +53687,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -53683,6 +53714,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53692,6 +53724,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53701,6 +53734,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53715,39 +53749,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 342 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 2, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53764,32 +53806,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 5120 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetB: 4096 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53798,9 +53841,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53808,26 +53851,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53837,6 +53888,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53846,6 +53898,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53860,45 +53913,53 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 343 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id002 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -53909,36 +53970,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53947,9 +54005,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53957,26 +54015,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53986,6 +54052,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53995,6 +54062,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54009,33 +54077,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 344 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54048,40 +54124,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -54095,10 +54172,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54106,19 +54183,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54126,6 +54208,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54135,6 +54218,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54144,6 +54228,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54158,33 +54243,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 345 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 - SubGroup0: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 12 + SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54196,58 +54291,55 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54255,26 +54347,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54284,6 +54382,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54293,6 +54392,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54307,33 +54407,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 346 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id003 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54346,8 +54456,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -54355,31 +54465,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 24 - LSPB: 24 - LVCA: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 4608 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -54393,10 +54504,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54404,19 +54515,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54424,6 +54540,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54433,6 +54550,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54442,6 +54560,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54456,39 +54575,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 347 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id003 + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -54505,47 +54634,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54553,19 +54683,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54573,6 +54710,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54582,6 +54720,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54591,6 +54730,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54605,85 +54745,94 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 348 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 - SubGroup0: 16 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 8 - LVCA: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54691,10 +54840,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54702,19 +54851,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54722,6 +54878,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54731,6 +54888,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54740,6 +54898,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54754,48 +54913,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 349 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id009 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -54806,33 +54973,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54840,10 +55008,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54851,19 +55019,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54871,6 +55046,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54880,6 +55056,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54889,6 +55066,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54903,85 +55081,94 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 350 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54990,9 +55177,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55000,19 +55187,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55020,6 +55214,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55029,6 +55224,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55038,6 +55234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55052,96 +55249,105 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 351 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55149,19 +55355,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55169,6 +55380,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55178,6 +55390,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55187,6 +55400,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55201,46 +55415,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 352 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -55249,44 +55473,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 16 - LVCA: 4 - LVCB: 8 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -55298,19 +55523,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55318,6 +55550,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55327,6 +55560,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55336,6 +55570,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55350,35 +55585,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 353 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55388,9 +55631,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -55398,31 +55641,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 4 + LSPA: 96 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55436,10 +55680,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55447,19 +55691,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55467,6 +55716,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55476,6 +55726,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55485,6 +55736,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55499,35 +55751,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 354 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55537,9 +55799,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -55552,26 +55814,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 + LSPA: 64 + LSPB: 128 + LVCA: 4 LVCB: 2 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55585,10 +55848,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55596,26 +55859,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55625,6 +55896,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55634,6 +55906,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55648,35 +55921,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 355 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55686,41 +55967,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 16 + LSPA: 64 + LSPB: 128 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55734,10 +56016,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55745,19 +56027,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55765,6 +56054,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55774,6 +56064,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55783,6 +56074,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55797,75 +56089,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 356 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3200 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -55882,11 +56183,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55894,26 +56195,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55923,6 +56232,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55932,6 +56242,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55946,35 +56257,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 357 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55984,41 +56303,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56032,10 +56352,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56043,19 +56363,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56063,6 +56390,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56072,6 +56400,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56081,6 +56410,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56095,79 +56425,88 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 358 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id009 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56180,11 +56519,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56192,19 +56531,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56212,6 +56556,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56221,6 +56566,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56230,6 +56576,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56244,35 +56591,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 359 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56282,41 +56639,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56330,10 +56688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56341,19 +56699,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56361,6 +56726,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56370,6 +56736,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56379,6 +56746,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56393,48 +56761,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 360 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -56446,26 +56822,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56478,11 +56855,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56490,19 +56867,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56510,6 +56892,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56519,6 +56902,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56528,6 +56912,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56542,35 +56927,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 361 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56580,58 +56975,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56639,19 +57035,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56659,6 +57060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56668,6 +57070,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56677,6 +57080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56691,35 +57095,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 362 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56729,8 +57143,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -56744,22 +57158,23 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3328 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -56778,29 +57193,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56808,6 +57232,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56817,6 +57242,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56826,8 +57252,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -56840,35 +57268,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 363 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56878,37 +57314,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3584 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -56927,29 +57364,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56957,6 +57403,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56966,6 +57413,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56975,8 +57423,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -56989,35 +57439,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 364 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57027,37 +57485,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3200 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -57076,29 +57535,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57106,6 +57574,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57115,6 +57584,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57124,8 +57594,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57138,35 +57610,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 365 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57176,41 +57656,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57224,30 +57705,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57255,6 +57745,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57264,6 +57755,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57273,8 +57765,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57287,35 +57781,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 366 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id009 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57325,8 +57827,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -57339,27 +57841,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57373,30 +57876,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57404,6 +57914,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57413,6 +57924,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57422,8 +57934,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57436,35 +57950,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 367 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57474,8 +57998,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57488,23 +58012,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3584 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -57523,29 +58048,36 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57553,6 +58085,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57562,6 +58095,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57571,8 +58105,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57585,48 +58121,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 368 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -57634,67 +58180,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 32 - LdcEqualsLdd: false + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57702,6 +58256,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57711,6 +58266,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57720,8 +58276,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57734,14 +58292,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 369 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -57752,105 +58317,113 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57860,6 +58433,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57869,8 +58443,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57883,39 +58459,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 370 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57923,76 +58509,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 32 - LVPB: 32 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58000,6 +58596,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58009,6 +58606,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58018,8 +58616,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58032,33 +58632,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 371 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58081,67 +58689,77 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58149,6 +58767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58158,6 +58777,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58167,8 +58787,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58181,39 +58803,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 372 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -58221,7 +58851,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -58230,36 +58860,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -58267,37 +58898,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58307,6 +58948,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58316,8 +58958,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58330,33 +58974,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 373 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58379,74 +59031,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58456,6 +59119,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58465,8 +59129,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58479,33 +59145,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 374 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58519,76 +59193,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58596,6 +59280,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58605,6 +59290,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58614,8 +59300,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58628,33 +59316,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 375 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58668,76 +59364,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58745,6 +59451,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58754,6 +59461,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58763,8 +59471,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58777,48 +59487,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 376 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -58826,36 +59544,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -58863,30 +59582,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58894,6 +59620,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58903,6 +59630,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58912,8 +59640,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58926,33 +59656,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 377 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58965,77 +59705,85 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3328 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59043,6 +59791,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59052,6 +59801,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59061,8 +59811,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59075,47 +59827,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 378 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -59124,36 +59886,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -59161,37 +59924,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59201,6 +59972,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59210,8 +59982,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59224,33 +59998,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 379 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 1 - WorkGroupMappingType: B + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59263,77 +60047,85 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59341,6 +60133,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59350,6 +60143,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59359,8 +60153,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59373,33 +60169,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 380 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59412,7 +60218,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -59422,67 +60228,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59490,6 +60304,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59499,6 +60314,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59508,8 +60324,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59522,39 +60340,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 381 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59562,33 +60390,34 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 @@ -59600,7 +60429,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -59608,37 +60437,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59648,6 +60487,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59657,8 +60497,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59671,39 +60513,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 382 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59711,8 +60561,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -59720,74 +60570,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59797,6 +60658,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59806,8 +60668,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59820,39 +60684,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 383 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59860,65 +60732,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -59926,17 +60801,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59946,6 +60829,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59955,8 +60839,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59969,39 +60855,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 384 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -60009,45 +60903,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60055,19 +60950,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -60075,17 +60972,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60095,6 +61000,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60104,8 +61010,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60118,33 +61026,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 385 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -60167,13 +61083,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -60183,8 +61100,8 @@ LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 16384 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -60196,14 +61113,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -60211,7 +61128,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -60224,10 +61143,17 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60235,6 +61161,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60244,6 +61171,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60253,8 +61181,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60267,75 +61197,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 386 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 16 + LSPB: 64 LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -60345,45 +61284,53 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60393,6 +61340,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60402,8 +61350,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60416,85 +61366,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 387 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id020 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60502,37 +61463,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60542,6 +61511,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60551,8 +61521,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60565,73 +61537,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 388 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 @@ -60643,7 +61626,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60651,30 +61634,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60682,6 +61672,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60691,6 +61682,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60700,8 +61692,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60714,80 +61708,21597 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 389 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 - LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 390 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 391 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 392 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 393 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 394 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 395 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 396 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 397 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 398 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 399 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 400 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 401 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 402 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 403 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 404 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 405 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 406 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 407 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 408 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 409 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 410 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 411 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 412 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 413 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 414 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 2, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 5120 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 415 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 416 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 417 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 418 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 24 + LSPB: 24 + LVCA: 8 + LVCB: 8 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 419 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 420 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 421 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 422 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 423 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 424 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 425 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 426 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 427 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 428 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 429 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 430 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 431 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 432 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 433 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 434 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 435 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 436 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 437 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 438 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 439 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 440 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 441 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 442 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 443 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 444 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 445 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 446 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 447 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 448 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 449 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 450 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 451 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 452 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 453 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 454 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 455 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 456 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 457 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 458 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 459 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 460 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 461 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 462 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 463 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 464 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 465 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 466 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 467 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 468 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 469 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 470 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 471 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 472 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 473 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 474 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 475 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 476 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 477 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 478 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 479 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 480 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 481 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 482 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 483 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 484 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 485 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 486 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 487 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 488 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 489 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 490 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 491 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 492 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 493 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 494 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 495 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [6, 8] + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 496 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 497 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 498 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 499 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 500 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 501 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 502 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 503 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 504 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 505 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 506 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 507 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 508 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 509 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 510 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 511 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 512 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 513 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 514 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 515 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 516 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 517 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 518 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 519 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 520 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 521 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 522 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 523 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 4 + LSCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 524 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 2 + LSCB: 2 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 525 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 526 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 527 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 528 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 529 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 530 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -60863,14 +83374,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 390 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 531 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id033 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -60881,13 +83392,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -60901,9 +83411,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -60911,33 +83420,325 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 532 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id038 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 533 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -60948,10 +83749,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -60960,14 +83761,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -61012,31 +83811,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 391 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionIndex: 534 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 1 + WorkGroup: *id037 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61044,15 +83843,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -61060,37 +83858,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61098,9 +83896,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -61109,14 +83907,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -61161,31 +83957,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 392 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionIndex: 535 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61193,53 +83989,52 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61247,10 +84042,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61258,15 +84053,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -61310,31 +84103,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 393 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionIndex: 536 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id039 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id037 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61342,53 +84135,52 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61397,9 +84189,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61407,15 +84199,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -61459,31 +84249,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 394 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 537 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id037 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61497,43 +84287,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -61556,15 +84345,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -61608,31 +84395,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 395 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 538 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + ThreadTile: *id036 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id038 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61646,8 +84433,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -61656,11 +84442,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -61674,30 +84460,30 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61705,14 +84491,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -61757,31 +84541,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 396 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + SolutionIndex: 539 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id040 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61795,58 +84579,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61854,13 +84637,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -61906,33 +84687,34 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 397 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + SolutionIndex: 540 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id039 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id040 + WorkGroupMapping: 1 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61945,57 +84727,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62003,15 +84785,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62055,33 +84840,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 398 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 541 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62094,7 +84889,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -62104,47 +84899,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62152,15 +84947,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62204,33 +85002,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 399 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 542 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62243,57 +85051,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62301,15 +85109,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62353,33 +85164,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 400 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 543 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62392,57 +85213,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62450,15 +85271,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62502,33 +85326,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 401 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 544 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62541,57 +85375,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62599,15 +85433,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62651,46 +85488,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 402 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 545 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -62700,47 +85547,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62748,15 +85595,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62800,46 +85650,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 403 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 546 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -62849,43 +85709,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -62897,15 +85757,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62949,46 +85812,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 404 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 547 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -62998,43 +85871,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -63046,15 +85919,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63098,47 +85974,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 405 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 548 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -63147,47 +86033,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63195,15 +86081,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63247,46 +86136,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 406 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 549 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63296,22 +86195,22 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -63325,7 +86224,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -63333,10 +86232,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63344,15 +86243,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63396,48 +86298,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 407 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 550 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -63445,26 +86357,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -63474,7 +86386,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -63482,9 +86394,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -63493,15 +86405,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63545,46 +86460,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 408 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63594,26 +86519,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -63623,18 +86548,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63642,15 +86567,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63694,46 +86622,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 409 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63743,36 +86681,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -63780,10 +86718,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63793,13 +86731,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63843,46 +86784,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 410 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 553 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63892,47 +86843,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63940,15 +86891,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63992,46 +86946,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 411 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 554 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64041,47 +87005,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -64089,20 +87049,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -64141,46 +87104,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 412 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 555 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64190,47 +87163,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -64240,13 +87213,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64290,46 +87266,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 413 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 556 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64339,26 +87325,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -64368,7 +87354,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -64376,10 +87362,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -64387,15 +87373,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64439,96 +87428,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 414 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 557 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -64536,15 +87535,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64588,46 +87590,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 415 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 558 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id020 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64637,43 +87649,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -64686,14 +87698,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64737,46 +87752,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 416 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 559 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64786,46 +87811,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -64834,15 +87859,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64886,48 +87914,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 417 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 560 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -64935,46 +87973,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -64983,15 +88021,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65035,46 +88076,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 418 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65084,47 +88135,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65132,15 +88183,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65184,46 +88238,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 419 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65233,36 +88297,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65281,20 +88341,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -65333,35 +88396,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 420 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 563 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id017 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65372,7 +88445,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65385,11 +88458,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 @@ -65399,14 +88472,10 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -65431,19 +88500,22 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -65482,35 +88554,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 421 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65520,8 +88602,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65531,31 +88613,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -65569,9 +88651,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65579,15 +88661,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65631,35 +88716,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 422 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65669,42 +88764,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -65717,10 +88812,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65728,15 +88823,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65780,35 +88878,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 423 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 566 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [6, 8] - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65818,10 +88926,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -65832,28 +88940,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -65867,9 +88975,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65877,15 +88985,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65929,35 +89040,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 424 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65967,8 +89088,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65978,31 +89099,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66016,9 +89137,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66027,14 +89148,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66078,48 +89202,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 425 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66131,27 +89265,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66163,11 +89297,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66175,15 +89309,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66227,46 +89364,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 426 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66280,27 +89427,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66312,11 +89459,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66324,15 +89471,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66376,46 +89526,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 427 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66425,31 +89585,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66461,7 +89621,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -66475,13 +89635,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66525,46 +89688,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 428 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66578,27 +89751,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66610,7 +89783,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -66624,13 +89797,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66674,46 +89850,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 429 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66727,27 +89913,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66759,11 +89945,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66771,15 +89957,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66823,46 +90012,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 430 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66876,27 +90075,23 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66908,11 +90103,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66920,20 +90115,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -66972,33 +90170,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 431 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67011,7 +90219,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67025,8 +90233,8 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -67038,14 +90246,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67058,9 +90266,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -67069,15 +90277,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67121,33 +90332,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 432 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67160,7 +90381,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67174,8 +90395,8 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -67187,14 +90408,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67208,9 +90429,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67218,15 +90439,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67270,33 +90494,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 433 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67308,9 +90542,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -67318,32 +90552,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2624 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67356,10 +90586,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67367,20 +90597,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67419,33 +90652,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 434 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67458,7 +90701,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67468,31 +90711,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67505,10 +90748,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67516,15 +90759,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67568,80 +90814,90 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 435 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67653,11 +90909,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67665,15 +90921,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67717,48 +90976,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 436 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -67769,29 +91038,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3600 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -67802,11 +91071,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67814,15 +91083,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67866,33 +91138,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 437 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67905,7 +91187,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67915,32 +91197,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -67953,9 +91235,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67964,14 +91246,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68015,33 +91300,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 438 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68054,7 +91349,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68064,12 +91359,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -68081,15 +91376,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 6176 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68102,9 +91397,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68112,15 +91407,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68164,33 +91462,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 439 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68203,42 +91511,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6176 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68250,10 +91558,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68261,15 +91569,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68313,33 +91624,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 440 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68352,7 +91673,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68362,32 +91683,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68399,10 +91720,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68410,15 +91731,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68462,47 +91786,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 441 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -68510,33 +91844,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68547,11 +91881,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68559,15 +91893,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68611,33 +91948,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 442 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68650,7 +91997,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68664,28 +92011,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68697,10 +92044,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68708,15 +92055,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68760,46 +92110,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 443 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68813,28 +92173,24 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68845,11 +92201,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68857,20 +92213,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -68909,46 +92268,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 444 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68958,32 +92327,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68994,27 +92363,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69058,46 +92430,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 445 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69107,32 +92489,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69143,7 +92525,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -69157,13 +92539,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69207,46 +92592,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 446 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69260,28 +92655,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69292,7 +92687,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -69306,13 +92701,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69356,46 +92754,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 447 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69409,28 +92817,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 13376 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69441,11 +92849,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69455,13 +92863,16 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69505,46 +92916,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 448 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69558,28 +92979,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69590,11 +93011,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69602,15 +93023,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69654,48 +93078,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 449 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -69707,28 +93141,24 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 2624 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69739,11 +93169,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69751,20 +93181,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -69803,46 +93236,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 450 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69852,32 +93295,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69888,11 +93331,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69900,15 +93343,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69952,81 +93398,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 451 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 4 - LSCB: 4 - LSPA: 16 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -70037,11 +93493,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70049,15 +93505,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70101,48 +93560,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 452 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id032 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70157,25 +93626,25 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 2 - LSCB: 2 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -70186,10 +93655,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -70200,13 +93669,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70250,33 +93722,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 453 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id032 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70289,57 +93771,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 12864 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 512 + LdsOffsetB_Blk: 8704 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70347,15 +93829,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70399,96 +93884,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 454 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70496,15 +93991,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70548,96 +94046,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 455 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70645,15 +94153,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70697,96 +94208,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 456 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70794,15 +94315,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70846,96 +94370,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 457 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70943,15 +94477,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70995,96 +94532,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 458 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71092,15 +94639,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71144,94 +94694,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 459 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 + LdsPadA: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71239,13 +94801,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71289,13 +94856,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 460 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 603 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -71305,79 +94880,82 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id038 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 32 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3408 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71385,13 +94963,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71435,13 +95018,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 461 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 604 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -71451,79 +95042,82 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71531,13 +95125,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71581,13 +95180,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 462 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -71597,79 +95204,82 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71677,13 +95287,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71727,13 +95342,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 463 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -71743,39 +95366,42 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -71783,39 +95409,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71823,13 +95449,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71873,95 +95504,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 464 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id039 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id037 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71969,13 +95611,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72019,33 +95666,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 465 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 608 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id037 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72057,56 +95714,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72115,13 +95773,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72165,13 +95828,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 466 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - ThreadTile: *id036 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -72181,17 +95852,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id038 + VectorWidth: 2 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72203,42 +95876,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72250,10 +95924,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72261,13 +95935,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72311,33 +95990,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 467 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 610 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - ThreadTile: *id039 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id040 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72349,6 +96038,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -72359,12 +96049,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -72376,15 +96066,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72396,9 +96082,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72407,18 +96093,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -72457,26 +96148,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 468 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id039 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 611 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id040 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -72507,7 +96207,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -72524,15 +96224,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72544,10 +96244,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72555,12 +96255,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -72614,8 +96314,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 469 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 612 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72623,12 +96323,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -72669,7 +96369,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -72679,22 +96379,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72706,10 +96406,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72717,13 +96417,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72776,20 +96476,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 470 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 613 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -72797,7 +96497,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -72831,7 +96531,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -72848,15 +96548,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72868,9 +96568,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72879,11 +96579,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -72938,28 +96638,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 471 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -72993,7 +96693,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73003,22 +96703,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73030,10 +96730,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73041,13 +96741,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -73100,29 +96800,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 472 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73155,7 +96855,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73172,15 +96872,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73262,8 +96962,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 473 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73271,7 +96971,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -73284,7 +96984,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73317,7 +97017,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73334,15 +97034,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73355,9 +97055,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73365,12 +97065,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73424,8 +97124,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 474 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73433,12 +97133,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -73446,7 +97146,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73479,7 +97179,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73496,15 +97196,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73517,9 +97217,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73527,12 +97227,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73586,20 +97286,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 475 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 618 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -73607,7 +97307,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -73632,7 +97332,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -73641,7 +97341,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73651,22 +97351,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73678,10 +97378,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73694,7 +97394,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73748,15 +97448,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 476 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 619 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -73769,8 +97469,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73794,7 +97494,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -73813,22 +97513,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73840,9 +97540,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73851,8 +97551,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -73910,8 +97610,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 477 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + SolutionIndex: 620 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73920,10 +97620,10 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -73932,7 +97632,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73982,15 +97682,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74072,8 +97772,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 478 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 621 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74094,7 +97794,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74110,7 +97810,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -74119,7 +97819,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -74127,32 +97827,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74163,11 +97863,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74175,12 +97875,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -74234,31 +97934,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 479 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 622 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -74292,7 +97992,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74306,15 +98006,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74326,10 +98026,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74337,12 +98037,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -74396,8 +98096,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 480 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74406,19 +98106,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74451,10 +98151,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74468,15 +98168,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74489,9 +98189,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74499,12 +98199,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -74558,8 +98258,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 481 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 624 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74567,20 +98267,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74613,10 +98313,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74630,15 +98330,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74650,9 +98350,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -74661,11 +98361,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -74720,8 +98420,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 482 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 625 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74729,20 +98429,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74764,7 +98464,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -74775,10 +98475,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74792,11 +98492,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74820,7 +98524,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -74835,7 +98539,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -74878,29 +98582,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 483 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 626 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74933,10 +98637,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74950,15 +98654,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74982,7 +98686,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -75040,8 +98744,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 484 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 627 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75049,7 +98753,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -75060,7 +98764,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 1] WorkGroupMapping: 64 WorkGroupMappingType: B @@ -75095,10 +98799,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -75112,15 +98816,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75132,10 +98836,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75143,12 +98847,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -75202,8 +98906,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 485 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 628 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75211,18 +98915,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B @@ -75240,7 +98944,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75257,32 +98961,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 9280 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75293,11 +98997,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75305,12 +99009,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -75364,31 +99068,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 486 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 629 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75402,7 +99106,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75427,24 +99131,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14464 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 LdsPadA: 2 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75455,11 +99159,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75471,9 +99175,9 @@ NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -75526,31 +99230,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 487 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + SolutionIndex: 630 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75564,7 +99268,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75581,7 +99285,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -75589,24 +99293,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 LdsPadA: 2 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75617,11 +99321,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75634,8 +99338,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -75688,15 +99392,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 488 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 631 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -75709,10 +99413,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75726,7 +99430,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75743,32 +99447,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 + LVCA: 2 + LVCB: 4 + LVPA: 32 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3424 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75779,7 +99483,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 32 MacroTileA: 128 @@ -75792,10 +99496,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -75850,8 +99554,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 489 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 632 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75859,7 +99563,7 @@ SubGroup1: 8 SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -75870,11 +99574,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75888,7 +99592,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75897,7 +99601,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -75905,32 +99609,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3680 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75941,10 +99645,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75953,7 +99657,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -76012,31 +99716,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 490 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 633 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -76056,7 +99760,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -76070,7 +99774,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -76084,11 +99788,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76112,7 +99820,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -76127,7 +99835,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -76170,8 +99878,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 491 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 634 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76179,7 +99887,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -76190,9 +99898,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76214,9 +99922,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -76228,25 +99936,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76258,10 +99970,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76270,11 +99982,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -76285,7 +99997,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -76328,16 +100040,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 492 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 635 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -76348,9 +100060,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76383,32 +100095,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6272 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76421,9 +100133,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76432,12 +100144,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -76490,16 +100202,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 493 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 636 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -76510,9 +100222,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76545,10 +100257,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -76562,15 +100274,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76582,9 +100294,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76593,11 +100305,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -76652,8 +100364,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 494 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 637 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76661,20 +100373,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76710,7 +100422,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -76724,15 +100436,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2112 LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 0 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76756,7 +100468,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -76814,8 +100526,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 495 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 638 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76834,9 +100546,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76869,32 +100581,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76906,10 +100618,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76917,13 +100629,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -76976,29 +100688,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 496 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 639 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -77014,13 +100726,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 @@ -77039,24 +100751,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4736 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 4160 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77067,11 +100775,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 256 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 256 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77079,11 +100787,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -77095,7 +100803,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -77138,8 +100846,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 497 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77147,12 +100855,12 @@ SubGroup1: 8 SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -77162,7 +100870,7 @@ WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -77176,7 +100884,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -77193,7 +100901,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -77201,24 +100909,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77229,11 +100937,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77241,11 +100949,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -77300,8 +101008,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 498 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 641 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77309,12 +101017,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -77322,9 +101030,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -77365,22 +101073,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3712 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB_Blk: 3136 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77393,9 +101101,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77407,9 +101115,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -77462,15 +101170,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 499 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 642 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -77483,7 +101191,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -77534,15 +101242,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77555,9 +101263,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77565,12 +101273,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -77624,20 +101332,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 500 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 643 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -77645,7 +101353,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -77696,15 +101404,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77716,9 +101424,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77727,11 +101435,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -77786,8 +101494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 501 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 644 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77796,10 +101504,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -77830,7 +101538,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -77858,11 +101566,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 2112 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77874,10 +101586,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77889,8 +101601,8 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -77901,7 +101613,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -77944,16 +101656,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 502 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 645 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -77965,7 +101677,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -78016,15 +101728,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2112 LdsOffsetB_Blk: 6208 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78106,28 +101818,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 503 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 646 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -78178,15 +101890,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78198,10 +101910,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78209,12 +101921,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -78268,8 +101980,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 504 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 647 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78278,11 +101990,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -78290,7 +102002,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -78340,11 +102052,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 2688 LdsOffsetA: 0 LdsOffsetB: 2112 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78426,8 +102138,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 505 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 648 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78448,7 +102160,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -78473,7 +102185,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -78481,7 +102193,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -78491,22 +102203,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78518,9 +102230,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -78535,7 +102247,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78588,16 +102300,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 506 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 649 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -78609,8 +102321,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -78626,7 +102338,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78651,24 +102363,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78679,11 +102391,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78691,12 +102403,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -78750,31 +102462,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 507 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 650 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -78788,7 +102500,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78797,7 +102509,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -78805,32 +102517,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3600 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78841,11 +102553,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78853,13 +102565,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78912,31 +102624,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 508 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 651 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -78967,10 +102679,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -78984,15 +102696,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79016,7 +102728,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -79074,8 +102786,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 509 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -79083,7 +102795,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -79094,9 +102806,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -79120,7 +102832,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -79129,32 +102841,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6176 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79166,10 +102878,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79178,11 +102890,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -79236,16 +102948,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 510 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -79256,9 +102968,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -79291,10 +103003,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -79308,15 +103020,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6176 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79329,9 +103041,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79339,12 +103051,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -79398,28 +103110,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 511 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 654 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 32, 1] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -79453,32 +103165,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79491,9 +103203,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79502,12 +103214,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79560,16 +103272,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 512 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 655 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -79580,8 +103292,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -79598,7 +103310,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79607,7 +103319,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79623,23 +103335,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 64 - LVCA: 2 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -79651,11 +103363,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79668,7 +103380,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -79722,15 +103434,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 513 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 656 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -79743,10 +103455,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -79787,21 +103499,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -79814,10 +103526,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79825,13 +103537,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79884,20 +103596,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 514 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -79905,8 +103617,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -79928,7 +103640,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -79956,10 +103668,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -79973,9 +103689,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79983,12 +103699,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -79999,7 +103715,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -80042,8 +103758,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 515 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80051,12 +103767,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80064,7 +103780,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -80089,7 +103805,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -80097,7 +103813,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -80107,21 +103823,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80134,9 +103850,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80151,7 +103867,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -80204,16 +103920,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 516 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -80225,8 +103941,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -80259,7 +103975,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -80276,14 +103992,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80296,9 +104012,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80307,11 +104023,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -80366,29 +104082,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 517 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -80412,7 +104128,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -80421,7 +104137,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -80431,21 +104147,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80458,9 +104174,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80469,8 +104185,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -80528,8 +104244,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 518 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 661 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80537,11 +104253,11 @@ SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -80550,7 +104266,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -80566,7 +104282,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80591,23 +104307,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13376 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 - LdsPadA: 0 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80619,11 +104335,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80631,8 +104347,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -80690,8 +104406,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 519 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + SolutionIndex: 662 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80700,11 +104416,11 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80712,9 +104428,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -80728,7 +104444,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80753,23 +104469,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80781,11 +104497,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80793,11 +104509,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -80852,8 +104568,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 520 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 663 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80862,11 +104578,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80874,9 +104590,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -80890,16 +104606,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -80907,7 +104623,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -80915,19 +104631,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80939,11 +104659,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80955,8 +104675,8 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -80967,7 +104687,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81010,15 +104730,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 521 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 664 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -81031,10 +104751,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81048,7 +104768,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81065,7 +104785,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -81073,23 +104793,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -81101,11 +104821,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81118,8 +104838,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81172,16 +104892,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 522 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 665 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -81193,10 +104913,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81210,7 +104930,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81227,7 +104947,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -81235,23 +104955,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -81263,11 +104983,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81275,12 +104995,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -81334,8 +105054,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 523 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 666 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81343,12 +105063,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -81358,7 +105078,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81372,7 +105092,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81397,23 +105117,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -81425,11 +105145,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81442,8 +105162,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81496,15 +105216,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 524 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 667 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -81517,10 +105237,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81534,7 +105254,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81542,7 +105262,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -81559,23 +105279,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12864 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 512 - LdsOffsetB_Blk: 8704 - LdsPadA: 0 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -81587,11 +105307,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81603,7 +105323,7 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 @@ -81658,15 +105378,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 525 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + SolutionIndex: 668 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -81679,10 +105399,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81705,7 +105425,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -81716,7 +105436,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -81724,21 +105444,21 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81751,9 +105471,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81761,12 +105481,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -81793,6 +105513,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81802,6 +105523,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81820,29 +105542,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 526 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 669 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -81878,7 +105600,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -81892,15 +105614,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81913,9 +105635,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81923,12 +105645,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -81955,6 +105677,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81964,6 +105687,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81982,8 +105706,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 527 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 670 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81992,19 +105716,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -82020,7 +105744,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82029,7 +105753,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82040,44 +105764,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82085,13 +105809,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82117,6 +105841,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82126,6 +105851,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82144,31 +105870,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 528 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 671 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82182,7 +105908,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82199,32 +105925,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 2 + LVCB: 2 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82235,10 +105961,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -82248,12 +105974,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82279,6 +106005,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82288,6 +106015,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82306,16 +106034,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 529 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 672 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -82326,11 +106054,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82344,7 +106072,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82353,7 +106081,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82361,47 +106089,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 - LVCA: 8 + LSPB: 16 + LVCA: 4 LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 2 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82409,13 +106137,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82441,6 +106169,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82450,6 +106179,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82468,31 +106198,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 530 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 673 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [4, 4, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82515,7 +106245,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82526,44 +106256,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 8 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82571,13 +106301,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82603,6 +106333,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82612,6 +106343,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82630,29 +106362,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 531 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 674 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -82668,7 +106400,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82688,44 +106420,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3408 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82733,13 +106465,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82765,6 +106497,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82774,6 +106507,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82792,31 +106526,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 532 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 675 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82830,7 +106564,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82847,7 +106581,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -82855,24 +106589,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 2 + LVCB: 2 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82883,10 +106617,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -82899,9 +106633,9 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82927,6 +106661,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82936,6 +106671,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82954,16 +106690,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 533 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -82975,10 +106711,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82992,7 +106728,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83000,8 +106736,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -83009,47 +106745,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 LVCA: 8 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83057,13 +106793,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83089,6 +106825,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83098,6 +106835,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83116,31 +106854,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 534 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83154,7 +106892,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83171,7 +106909,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -83179,39 +106917,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83219,13 +106957,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83251,6 +106989,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83260,6 +106999,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83278,20 +107018,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 535 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -83299,10 +107039,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83316,7 +107056,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83325,7 +107065,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -83333,47 +107073,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83381,13 +107121,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83413,6 +107153,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83422,6 +107163,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83440,31 +107182,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 536 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 2 + SubGroupA: 8 + SubGroupB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 2, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83478,7 +107220,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83495,32 +107237,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83531,11 +107273,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83543,11 +107285,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -83575,6 +107317,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83584,6 +107327,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83602,31 +107346,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 537 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83640,7 +107384,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83657,32 +107401,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 2 + LVCB: 2 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83693,10 +107437,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -83706,12 +107450,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83737,6 +107481,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83746,6 +107491,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83764,16 +107510,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 538 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -83784,11 +107530,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83808,7 +107554,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83819,43 +107565,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83863,13 +107613,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83879,7 +107629,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -83895,6 +107645,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83904,6 +107655,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83922,28 +107674,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 539 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 2 + SubGroup1: 8 + SubGroupA: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [2, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -83966,7 +107718,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83977,10 +107729,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -83994,15 +107746,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84026,7 +107774,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -84041,7 +107789,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -84057,6 +107805,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84066,6 +107815,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84084,8 +107834,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 540 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84104,7 +107854,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -84122,7 +107872,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84139,7 +107889,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -84147,39 +107897,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84187,13 +107937,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84219,6 +107969,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84228,6 +107979,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84246,31 +107998,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [4, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84284,7 +108036,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84293,7 +108045,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -84301,47 +108053,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84349,13 +108101,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84381,6 +108133,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84390,6 +108143,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84408,31 +108162,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 2 + SubGroupA: 8 + SubGroupB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 2, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84446,7 +108200,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84466,29 +108220,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 2 + LVCB: 2 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84499,10 +108253,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -84512,12 +108266,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84543,6 +108297,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84552,6 +108307,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84570,14 +108326,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 686 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -84590,11 +108346,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84608,7 +108364,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84625,7 +108381,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -84633,39 +108389,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84673,13 +108429,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84705,6 +108461,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84714,6 +108471,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84732,16 +108490,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 687 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -84752,11 +108510,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [4, 4, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84790,7 +108548,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -84804,15 +108562,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84824,9 +108582,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -84835,11 +108593,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -84867,6 +108625,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84876,6 +108635,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84894,8 +108654,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 688 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84904,19 +108664,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84932,7 +108692,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84949,32 +108709,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84985,11 +108745,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84997,13 +108757,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85029,6 +108789,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85038,6 +108799,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85056,31 +108818,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 689 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -85102,7 +108864,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -85111,47 +108873,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85159,13 +108921,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85191,6 +108953,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85200,6 +108963,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85218,29 +108982,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 690 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 2 + SubGroup1: 8 + SubGroupA: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [2, 8, 4] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85265,7 +109029,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -85273,47 +109037,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 16 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85321,13 +109085,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85353,6 +109117,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85362,6 +109127,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85380,16 +109146,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 691 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -85400,9 +109166,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85418,7 +109184,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85435,7 +109201,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -85443,39 +109209,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85483,13 +109249,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85515,6 +109281,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85524,6 +109291,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85542,16 +109310,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 692 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -85562,11 +109330,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -85597,7 +109365,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -85614,15 +109382,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85634,9 +109402,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -85645,11 +109413,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -85668,6 +109436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85677,6 +109446,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85686,6 +109456,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85704,8 +109475,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 693 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85713,20 +109484,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85749,7 +109520,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -85776,15 +109547,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85796,10 +109567,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85807,12 +109578,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85830,6 +109603,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85839,6 +109613,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85848,6 +109623,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85866,8 +109642,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 694 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85876,23 +109652,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85911,7 +109685,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -85938,15 +109712,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85958,9 +109732,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -85969,11 +109743,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -85992,6 +109768,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86001,6 +109778,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86010,6 +109788,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86028,8 +109807,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 695 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86038,10 +109817,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -86050,11 +109829,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86083,7 +109860,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -86100,15 +109877,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86120,9 +109897,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -86131,11 +109908,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -86154,6 +109931,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86163,6 +109941,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86172,6 +109951,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86190,8 +109970,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 696 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86199,20 +109979,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86245,7 +110025,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -86262,15 +110042,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86283,9 +110063,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86293,12 +110073,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86316,6 +110096,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86325,6 +110106,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86334,6 +110116,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86352,8 +110135,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 697 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86361,12 +110144,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -86374,7 +110157,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86397,7 +110180,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -86407,32 +110190,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86444,10 +110227,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86455,13 +110238,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86478,6 +110263,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86487,6 +110273,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86496,6 +110283,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86514,33 +110302,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 698 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86559,7 +110345,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -86569,10 +110355,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -86586,15 +110372,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86606,10 +110392,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86617,12 +110403,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86640,6 +110428,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86649,6 +110438,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86658,6 +110448,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86676,8 +110467,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 699 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86685,24 +110476,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86714,7 +110503,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86731,32 +110520,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 9280 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86767,10 +110556,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -86779,12 +110568,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86802,6 +110591,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86811,6 +110601,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86820,6 +110611,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86838,31 +110630,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -86876,7 +110668,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86896,29 +110688,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14464 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86929,10 +110721,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -86941,12 +110733,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86964,6 +110756,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86973,6 +110766,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86982,6 +110776,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87000,31 +110795,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 701 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87038,14 +110833,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -87055,32 +110850,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87091,11 +110886,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87103,8 +110898,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -87126,6 +110923,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87135,6 +110933,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87144,6 +110943,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87162,8 +110962,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 702 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87171,24 +110971,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87200,16 +110998,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -87225,24 +111023,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3424 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87253,11 +111051,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87265,12 +111063,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87288,6 +111088,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87297,6 +111098,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87306,6 +111108,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87324,33 +111127,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 703 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87362,7 +111163,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87371,7 +111172,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -87387,24 +111188,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 64 - LVCA: 2 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87415,11 +111216,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87427,12 +111228,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87450,6 +111251,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87459,6 +111261,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87468,6 +111271,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87486,31 +111290,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 704 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87551,22 +111355,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87578,10 +111382,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87589,13 +111393,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87612,6 +111416,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87621,6 +111426,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87630,6 +111436,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87648,29 +111455,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -87693,16 +111500,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -87714,20 +111521,20 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -87740,9 +111547,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -87751,12 +111558,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87774,6 +111583,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87783,6 +111593,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87792,6 +111603,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87810,33 +111622,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87855,16 +111665,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -87875,21 +111685,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -87903,9 +111713,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87913,12 +111723,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87936,6 +111748,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87945,6 +111758,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87954,6 +111768,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87972,8 +111787,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87982,11 +111797,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -87997,8 +111812,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88018,15 +111831,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -88037,21 +111850,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -88064,10 +111877,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88080,7 +111893,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88098,6 +111911,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88107,6 +111921,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88116,6 +111931,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88134,8 +111950,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88144,13 +111960,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -88180,7 +111996,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -88188,7 +112004,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -88199,21 +112015,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -88226,10 +112042,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88237,12 +112053,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88260,6 +112076,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88269,6 +112086,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88278,6 +112096,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88296,28 +112115,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -88354,7 +112173,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -88368,15 +112187,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88388,10 +112207,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88400,11 +112219,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88422,6 +112241,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88431,6 +112251,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88440,6 +112261,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88458,15 +112280,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 710 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -88478,8 +112300,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -88502,39 +112324,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4736 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 4160 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88546,10 +112372,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88558,11 +112384,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88573,13 +112401,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88589,6 +112418,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88598,6 +112428,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88616,33 +112447,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 711 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88661,17 +112490,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -88681,20 +112510,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -88708,10 +112537,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88721,10 +112550,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88742,6 +112573,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88751,6 +112583,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88760,6 +112593,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88778,8 +112612,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88787,14 +112621,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -88803,8 +112637,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88823,17 +112655,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -88843,20 +112675,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -88871,9 +112703,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88881,13 +112713,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88904,6 +112738,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88913,6 +112748,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88922,6 +112758,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88940,33 +112777,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 713 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88985,42 +112820,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89044,11 +112879,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89066,6 +112903,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89075,6 +112913,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89084,6 +112923,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89102,33 +112942,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 714 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89147,42 +112985,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89195,9 +113033,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89205,12 +113043,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89228,6 +113068,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89237,6 +113078,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89246,6 +113088,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89264,8 +113107,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 715 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89274,23 +113117,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89310,41 +113151,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89368,11 +113209,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89390,6 +113231,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89399,6 +113241,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89408,6 +113251,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89426,8 +113270,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 716 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89442,13 +113286,13 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -89472,41 +113316,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89518,10 +113362,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89530,11 +113374,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89552,6 +113396,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89561,6 +113406,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89570,6 +113416,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89588,8 +113435,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 717 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89598,19 +113445,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -89633,16 +113480,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -89653,20 +113500,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -89680,9 +113527,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -89691,12 +113538,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89714,6 +113563,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89723,6 +113573,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89732,6 +113583,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89750,8 +113602,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 718 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89760,23 +113612,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89794,9 +113644,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -89804,7 +113654,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -89815,16 +113665,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2688 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 2112 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -89838,10 +113692,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89849,12 +113703,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89865,13 +113721,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89881,6 +113738,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89890,6 +113748,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89908,33 +113767,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 719 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89954,7 +113811,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -89962,7 +113819,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -89973,20 +113830,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90000,10 +113857,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90011,12 +113868,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90034,6 +113891,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90043,6 +113901,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90052,6 +113911,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90070,29 +113930,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 720 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90116,15 +113976,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90135,20 +113995,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90162,10 +114022,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90178,7 +114038,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90196,6 +114056,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90205,6 +114066,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90214,6 +114076,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90232,15 +114095,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 721 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -90253,8 +114116,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90277,17 +114140,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -90299,18 +114162,18 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90324,10 +114187,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90335,13 +114198,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90358,6 +114223,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90367,6 +114233,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90376,6 +114243,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90394,33 +114262,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 722 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90439,16 +114305,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90461,18 +114327,18 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90486,10 +114352,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90497,13 +114363,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90520,6 +114388,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90529,6 +114398,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90538,6 +114408,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90556,20 +114427,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 723 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -90577,12 +114448,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90603,14 +114472,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90622,19 +114491,19 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90648,10 +114517,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90659,11 +114528,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -90682,6 +114551,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90691,6 +114561,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90700,6 +114571,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90718,29 +114590,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 724 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90763,16 +114635,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90783,20 +114655,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90810,10 +114682,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90823,9 +114695,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -90844,6 +114718,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90853,6 +114728,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90862,6 +114738,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90880,15 +114757,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 725 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 @@ -90901,12 +114778,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90926,15 +114801,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90945,20 +114820,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90972,9 +114847,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -90983,12 +114858,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -91006,6 +114881,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91015,6 +114891,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91024,6 +114901,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91042,8 +114920,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 726 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91052,10 +114930,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -91064,7 +114942,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91087,7 +114965,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -91147,6 +115025,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -91168,6 +115048,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91177,6 +115058,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91186,6 +115068,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91204,8 +115087,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 727 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91220,17 +115103,15 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91277,174 +115158,12 @@ LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: true - TransposeB: false - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - UnrollMemFence: false - UseSgprForGRO: 1 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 - WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -91458,9 +115177,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91469,11 +115188,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -91492,6 +115211,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91501,6 +115221,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91510,6 +115231,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91528,8 +115250,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 728 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91538,10 +115260,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -91550,7 +115272,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91572,10 +115294,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91586,29 +115308,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91620,9 +115338,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91631,13 +115349,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91647,13 +115365,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91663,6 +115382,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91672,6 +115392,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91690,29 +115411,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 729 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91734,43 +115455,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91782,10 +115499,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91793,13 +115510,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91809,13 +115528,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91825,6 +115545,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91834,6 +115555,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91852,33 +115574,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 730 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91896,43 +115616,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91945,9 +115661,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91955,13 +115671,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91971,13 +115689,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91987,6 +115706,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91996,6 +115716,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92014,33 +115735,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 731 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92052,14 +115771,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92072,29 +115791,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92105,11 +115820,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92118,11 +115833,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92133,13 +115850,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92149,6 +115867,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92158,6 +115877,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92176,16 +115896,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -92196,13 +115916,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92214,13 +115932,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92234,29 +115952,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92267,7 +115981,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -92280,11 +115994,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92295,13 +116009,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92311,6 +116026,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92320,6 +116036,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92338,8 +116055,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92347,7 +116064,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -92358,11 +116075,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92376,14 +116093,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92393,32 +116110,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92429,11 +116142,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92441,13 +116154,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92457,13 +116172,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92473,6 +116189,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92482,6 +116199,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92500,33 +116218,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 734 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92538,13 +116254,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92558,29 +116274,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92591,11 +116303,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92603,13 +116315,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92619,13 +116331,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92635,6 +116348,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92644,6 +116358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92662,31 +116377,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 735 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92700,14 +116415,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92720,29 +116435,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92753,11 +116464,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92765,13 +116476,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92781,13 +116494,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92797,6 +116511,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92806,6 +116521,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92824,33 +116540,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 736 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92862,13 +116576,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92882,29 +116596,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92915,11 +116625,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92927,13 +116637,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92943,13 +116653,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92959,6 +116670,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92968,6 +116680,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92986,31 +116699,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 737 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -93024,14 +116737,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93044,29 +116757,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93077,11 +116786,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93089,12 +116798,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -93105,13 +116816,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93121,6 +116833,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93130,6 +116843,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93148,8 +116862,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 738 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93157,24 +116871,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93192,10 +116904,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93206,7 +116918,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -93214,21 +116926,17 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93241,9 +116949,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93251,8 +116959,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -93267,13 +116975,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93293,8 +117002,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93312,8 +117021,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93321,20 +117030,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -93356,8 +117065,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93370,29 +117079,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93404,10 +117109,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93415,13 +117120,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93431,13 +117138,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93457,8 +117165,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93476,33 +117184,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 740 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93514,16 +117220,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93539,39 +117245,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93579,8 +117281,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -93595,13 +117297,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93621,8 +117324,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93640,31 +117343,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 + SolutionIndex: 741 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -93678,14 +117381,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93698,29 +117401,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93731,7 +117430,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -93743,13 +117442,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93759,13 +117460,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93785,8 +117487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93804,33 +117506,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 600 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 + SolutionIndex: 742 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] + VectorWidth: 1 + WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93842,16 +117542,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93862,44 +117562,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 4 + LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 + LoopUnroll: 32 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93907,12 +117603,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -93923,13 +117621,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93949,8 +117648,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93968,33 +117667,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 601 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 + SolutionIndex: 743 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94006,13 +117703,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 @@ -94026,44 +117723,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 864 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94071,8 +117764,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -94087,13 +117780,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94113,8 +117807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94132,31 +117826,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 602 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 + SolutionIndex: 744 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94170,16 +117864,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94190,44 +117884,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94235,11 +117925,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -94251,13 +117943,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94277,8 +117970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94296,33 +117989,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 603 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 + SolutionIndex: 745 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94334,13 +118025,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -94354,29 +118045,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -94387,10 +118074,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94400,12 +118087,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94415,13 +118102,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94441,8 +118129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94460,16 +118148,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 604 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 + SolutionIndex: 746 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -94480,11 +118168,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94498,16 +118186,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94518,44 +118206,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94563,12 +118247,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -94579,13 +118263,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94605,8 +118290,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94624,31 +118309,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 605 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 + SolutionIndex: 747 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94662,13 +118347,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -94679,47 +118364,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94727,8 +118408,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -94743,13 +118424,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94769,8 +118451,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94788,37 +118470,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 606 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 + SolutionIndex: 748 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94826,7 +118508,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94835,7 +118517,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94846,44 +118528,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 8 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94891,13 +118574,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94907,6 +118590,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -94914,6 +118598,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94933,8 +118618,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94952,37 +118637,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 607 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 2, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 32 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94990,15 +118675,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -95006,7 +118691,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95014,23 +118699,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -95043,11 +118729,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95055,12 +118741,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95071,6 +118759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95078,6 +118767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95097,8 +118787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95116,8 +118806,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 608 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -95125,12 +118815,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -95138,15 +118828,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95154,14 +118842,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -95178,23 +118866,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -95207,11 +118896,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95219,13 +118908,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95235,13 +118926,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95261,8 +118954,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95280,37 +118973,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 609 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95327,7 +119018,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -95338,43 +119029,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -95384,12 +119076,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95399,6 +119091,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95406,6 +119099,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95425,8 +119119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95444,29 +119138,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 610 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 2 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [2, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -95474,7 +119168,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95488,9 +119182,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -95506,35 +119200,40 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -95543,8 +119242,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -95559,13 +119258,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95585,8 +119286,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95604,14 +119305,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 611 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -95625,8 +119326,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -95634,7 +119335,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95642,39 +119343,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -95688,18 +119390,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95707,13 +119409,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95723,6 +119428,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95730,6 +119436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95749,8 +119456,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95768,16 +119475,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 612 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -95789,16 +119496,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95813,8 +119518,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -95822,7 +119527,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95830,40 +119535,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 8 - LVCA: 2 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95871,13 +119577,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95887,6 +119596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95894,6 +119604,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95913,8 +119624,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95932,16 +119643,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 613 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 + SolutionIndex: 755 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -95953,16 +119664,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 2, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95970,14 +119679,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -95990,29 +119699,26 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96023,11 +119729,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96036,12 +119742,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96051,13 +119760,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96077,8 +119788,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96096,16 +119807,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 614 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 + SolutionIndex: 756 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96116,17 +119827,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96134,14 +119843,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -96151,47 +119860,44 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96199,13 +119905,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96215,13 +119924,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96241,8 +119952,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96260,15 +119971,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 615 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 + SolutionIndex: 757 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -96281,16 +119992,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96322,6 +120031,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -96370,6 +120080,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96379,6 +120090,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96386,6 +120098,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96405,8 +120118,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96424,8 +120137,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 616 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 758 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96433,7 +120146,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96446,7 +120159,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -96454,7 +120167,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96462,13 +120175,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -96486,25 +120199,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96515,11 +120225,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96533,7 +120243,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96543,13 +120254,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96569,8 +120282,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96588,16 +120301,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 617 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 759 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96609,16 +120322,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96635,7 +120348,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96650,39 +120363,40 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -96696,8 +120410,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96707,6 +120422,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96714,6 +120430,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96733,8 +120450,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96752,20 +120469,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 618 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 + SolutionIndex: 760 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 2 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -96773,8 +120490,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [2, 8, 4] - WorkGroupMapping: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -96782,7 +120499,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96790,64 +120507,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96855,13 +120573,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96871,6 +120592,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96878,6 +120600,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96897,8 +120620,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96916,37 +120639,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 619 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 761 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 4 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96961,8 +120682,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -96974,44 +120695,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97019,13 +120741,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97035,6 +120760,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97042,6 +120768,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97061,8 +120788,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97080,37 +120807,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 620 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 762 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 4 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97118,64 +120843,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97183,13 +120909,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97199,6 +120928,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97226,8 +120956,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97245,37 +120975,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 621 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97283,7 +121011,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -97292,7 +121020,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97303,44 +121031,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97348,15 +121077,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97366,6 +121096,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97393,8 +121124,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97412,35 +121143,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 622 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 764 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97455,57 +121186,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 16 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97513,15 +121245,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97531,6 +121262,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97558,8 +121290,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97577,35 +121309,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 623 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 765 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97620,9 +121354,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97637,40 +121371,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97678,13 +121413,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97694,6 +121432,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97721,8 +121460,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97740,37 +121479,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 624 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 766 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97778,7 +121515,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -97787,7 +121524,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97802,23 +121539,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 96 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97831,11 +121569,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97843,13 +121581,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97859,6 +121598,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97886,8 +121626,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97905,8 +121645,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 625 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 767 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97914,12 +121654,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -97929,13 +121669,13 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97943,7 +121683,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -97951,7 +121691,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -97967,23 +121707,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97996,10 +121737,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98008,15 +121749,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98026,8 +121768,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -98053,8 +121796,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98072,8 +121815,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 626 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 768 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98081,14 +121824,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -98096,11 +121839,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98108,7 +121851,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -98116,7 +121859,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -98132,23 +121875,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98161,10 +121905,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98173,15 +121917,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98191,6 +121936,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98218,8 +121964,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98237,8 +121983,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 627 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 769 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98246,11 +121992,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -98261,11 +122007,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98273,16 +122019,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -98297,23 +122043,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98326,11 +122073,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98338,13 +122085,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98354,8 +122104,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -98381,8 +122132,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98400,8 +122151,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 628 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98409,28 +122160,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98445,7 +122194,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98462,6 +122211,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -98505,11 +122255,14 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98519,6 +122272,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98546,8 +122300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98565,8 +122319,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 629 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98574,7 +122328,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 @@ -98590,12 +122344,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98603,14 +122355,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98627,23 +122379,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98656,7 +122409,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -98670,13 +122423,12 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98686,6 +122438,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98713,8 +122466,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98732,8 +122485,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 630 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 772 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98741,26 +122494,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98792,6 +122547,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -98802,13 +122558,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98822,10 +122578,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98833,15 +122589,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98851,6 +122608,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98878,8 +122636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98897,8 +122655,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 631 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 773 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98906,12 +122664,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -98925,7 +122683,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98933,7 +122691,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -98942,7 +122700,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -98957,23 +122715,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98986,11 +122745,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98998,13 +122757,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99014,6 +122774,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99041,8 +122802,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -99060,8 +122821,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 632 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 774 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99069,28 +122830,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99106,7 +122867,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -99122,40 +122883,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99163,13 +122925,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99179,6 +122942,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99206,8 +122970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -99225,29 +122989,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 633 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 775 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -99255,7 +123019,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99283,10 +123047,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -99297,15 +123062,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99318,25 +123083,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99346,6 +123114,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99373,9 +123142,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -99392,8 +123162,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 634 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99401,18 +123171,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -99420,7 +123190,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99452,6 +123222,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -99462,15 +123233,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99483,25 +123254,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99511,6 +123285,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99538,9 +123313,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -99557,8 +123333,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 635 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 777 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99566,12 +123342,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -99585,7 +123361,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99600,16 +123376,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -99617,25 +123393,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99648,23 +123425,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99674,6 +123456,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99701,9 +123484,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -99720,8 +123504,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 636 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 778 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99729,14 +123513,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -99745,12 +123529,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99765,16 +123547,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -99782,25 +123564,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99813,23 +123596,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99839,6 +123627,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99866,9 +123655,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -99885,8 +123675,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 637 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 779 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99894,12 +123684,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -99907,15 +123697,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99931,41 +123719,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7200 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99978,23 +123767,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100004,6 +123796,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100031,9 +123824,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100050,8 +123844,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 638 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 780 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100059,18 +123853,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -100080,7 +123874,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100095,16 +123889,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100112,25 +123906,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100143,25 +123938,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100171,6 +123967,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100198,9 +123995,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100217,8 +124015,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 639 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 781 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100226,12 +124024,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -100239,13 +124037,15 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100260,16 +124060,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100277,25 +124077,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100308,25 +124109,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100336,6 +124138,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100363,9 +124166,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100382,8 +124186,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 640 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 782 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100391,14 +124195,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -100407,10 +124211,12 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100424,17 +124230,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100442,25 +124248,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100473,25 +124276,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100501,7 +124305,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100528,9 +124333,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100547,8 +124353,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 641 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 783 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100556,12 +124362,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -100569,13 +124375,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100607,6 +124415,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -100617,15 +124426,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100638,25 +124447,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100666,6 +124478,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100693,9 +124506,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100712,8 +124526,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 642 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 784 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100721,14 +124535,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -100740,7 +124554,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100772,6 +124586,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -100782,15 +124597,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100803,25 +124618,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100831,6 +124649,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100858,9 +124677,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100877,8 +124697,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 643 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 785 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100886,12 +124706,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -100899,13 +124719,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100913,49 +124733,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100966,7 +124787,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -100974,17 +124795,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100994,8 +124820,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -101021,9 +124848,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101040,8 +124868,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 644 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 786 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101049,28 +124877,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101085,42 +124911,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101139,17 +124966,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101159,8 +124991,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -101186,9 +125019,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101205,8 +125039,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 645 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 787 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101214,7 +125048,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -101225,17 +125059,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101251,15 +125083,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101267,25 +125099,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101304,19 +125137,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101326,6 +125162,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101353,9 +125190,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101372,8 +125210,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 646 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 788 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101381,14 +125219,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -101400,7 +125238,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101416,15 +125254,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101432,25 +125270,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101469,19 +125308,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101491,6 +125333,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101518,9 +125361,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101537,8 +125381,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 647 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 789 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101546,7 +125390,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -101559,13 +125403,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101573,7 +125417,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101597,25 +125441,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101626,7 +125471,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -101634,7 +125479,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -101642,9 +125489,10 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101654,6 +125502,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101681,9 +125530,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101700,8 +125550,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 648 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 790 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101709,14 +125559,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -101724,13 +125574,13 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101746,41 +125596,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101799,17 +125650,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101819,6 +125673,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101846,9 +125701,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101865,8 +125721,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 649 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 791 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101874,7 +125730,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -101885,9 +125741,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -101895,7 +125751,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101903,23 +125759,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101927,25 +125783,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101956,27 +125813,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101986,8 +125844,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -102013,9 +125872,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102032,8 +125892,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 650 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 792 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102041,26 +125901,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102075,16 +125937,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102092,25 +125954,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102122,26 +125985,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102151,6 +126015,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102178,9 +126043,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102197,8 +126063,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 651 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 793 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102206,12 +126072,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -102222,10 +126088,12 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102241,15 +126109,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102257,25 +126125,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102287,14 +126156,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -102302,9 +126173,10 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102314,6 +126186,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102341,9 +126214,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102360,8 +126234,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 652 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 794 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102369,20 +126243,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -102390,7 +126264,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102398,7 +126272,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102422,25 +126296,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102451,7 +126326,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -102459,19 +126334,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102481,8 +126359,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -102508,9 +126387,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102527,8 +126407,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 653 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 795 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102536,7 +126416,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -102549,13 +126429,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102563,14 +126443,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -102587,25 +126467,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102616,7 +126497,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -102624,17 +126505,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102644,8 +126530,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -102671,9 +126558,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102690,8 +126578,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 654 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 796 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102699,7 +126587,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -102714,13 +126602,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102728,7 +126614,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102737,7 +126623,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -102748,29 +126634,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102781,27 +126668,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102811,8 +126701,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -102838,9 +126729,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102857,8 +126749,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 655 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 797 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102866,26 +126758,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102893,16 +126785,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -102917,25 +126809,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102946,25 +126839,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102974,8 +126872,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103001,9 +126900,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103020,8 +126920,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 656 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 798 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103029,12 +126929,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -103042,15 +126942,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103064,8 +126962,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103078,24 +126976,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -103108,24 +127011,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103135,7 +127043,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103162,9 +127071,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103181,37 +127091,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 657 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 799 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103219,45 +127127,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103268,27 +127181,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103298,8 +127212,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103325,9 +127240,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103344,35 +127260,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 658 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 800 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103380,45 +127298,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103429,27 +127352,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103459,8 +127383,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103486,9 +127411,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103505,35 +127431,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 659 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 801 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103541,45 +127469,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103590,27 +127523,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103620,7 +127554,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103647,9 +127582,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103666,8 +127602,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 660 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 802 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103676,25 +127612,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103702,16 +127640,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103722,25 +127660,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103751,25 +127694,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103779,8 +127725,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103806,9 +127753,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103825,8 +127773,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 661 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 803 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103835,27 +127783,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103863,16 +127811,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103883,25 +127831,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103912,27 +127865,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103942,8 +127896,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103969,9 +127924,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103988,35 +127944,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 662 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 804 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104024,16 +127982,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104044,25 +128002,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104073,25 +128036,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104101,8 +128067,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104128,9 +128095,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104147,37 +128115,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 663 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 805 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104185,16 +128153,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104205,25 +128173,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104234,27 +128207,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104264,7 +128238,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104291,9 +128266,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104310,35 +128286,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 664 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 806 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104352,39 +128330,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104396,24 +128379,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104423,8 +128411,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104450,9 +128439,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104469,37 +128459,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 665 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 807 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104513,39 +128501,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104557,26 +128550,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104586,8 +128582,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104613,9 +128610,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104632,8 +128630,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 666 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 808 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104642,17 +128640,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -104660,7 +128658,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104674,39 +128672,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104718,24 +128721,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104745,8 +128753,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104772,9 +128781,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104791,8 +128801,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 667 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 809 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104801,27 +128811,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104835,39 +128843,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104879,26 +128892,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104908,7 +128924,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104935,9 +128952,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104954,35 +128972,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 668 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 810 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104996,39 +129014,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105040,24 +129063,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105067,7 +129095,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105094,9 +129123,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105113,37 +129143,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 669 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 811 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105157,7 +129185,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -105171,25 +129199,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105201,26 +129234,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105230,7 +129266,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105257,9 +129294,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105276,35 +129314,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 670 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 + SolutionIndex: 812 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105312,13 +129350,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -105332,25 +129370,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105361,27 +129404,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105391,7 +129437,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105418,9 +129465,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105437,35 +129485,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 671 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 813 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105473,15 +129521,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -105489,29 +129537,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 864 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105522,25 +129575,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105550,7 +129606,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -105577,9 +129634,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105596,37 +129654,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 672 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 814 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105640,39 +129698,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105684,26 +129747,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105713,7 +129777,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105740,9 +129805,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105759,35 +129825,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 673 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 815 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105801,17 +129869,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -105819,21 +129887,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105845,24 +129918,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105872,8 +129948,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -105899,9 +129976,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105918,29 +129996,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 674 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 816 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -105948,7 +130026,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105962,17 +130040,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -105980,21 +130058,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106006,24 +130089,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106033,7 +130119,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106060,9 +130147,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106079,28 +130167,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 675 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 817 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -106109,7 +130197,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106123,17 +130211,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -106141,21 +130229,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106167,24 +130260,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106194,7 +130290,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106221,9 +130318,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106240,28 +130338,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 676 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 818 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -106313,13 +130411,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106333,24 +130431,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106362,7 +130463,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -106388,9 +130489,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106407,8 +130509,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 677 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 819 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106417,11 +130519,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -106429,10 +130531,10 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 32 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -106452,16 +130554,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -106473,10 +130575,10 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -106507,19 +130609,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106531,7 +130634,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -106557,9 +130660,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106576,8 +130680,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 678 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 820 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106601,6 +130705,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106619,7 +130725,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106674,19 +130780,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106698,7 +130805,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -106724,9 +130831,181 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106743,8 +131022,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 679 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 822 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106768,6 +131047,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106786,9 +131067,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106799,7 +131080,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -106807,20 +131088,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106834,24 +131115,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106889,9 +131175,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106908,15 +131195,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 680 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 823 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -106928,13 +131215,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106953,7 +131238,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106974,51 +131259,56 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 + LSPA: 32 LSPB: 64 LVCA: 8 LVCB: 4 - LVPA: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107056,9 +131346,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107075,14 +131366,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 681 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 + SolutionIndex: 824 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -107096,12 +131387,181 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 16, 4] + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 825 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [12, 896.219] @@ -110025,8 +134485,6 @@ - [231, 6307.6] - - [1024, 512, 1, 4608] - [242, 7953.38] - - - [2048, 256, 1, 768] - - [242, 7059.14] - - [4096, 200, 1, 32] - [191, 2199.19] - - [4096, 200, 1, 3328] @@ -112697,5598 +137155,6108 @@ - [336, 6145.5] - - [1024, 3712, 1, 1024] - [338, 8933.88] + - - [256, 256, 192, 64] + - [343, 8264.64] + - - [768, 4096, 1, 768] + - [356, 9642.08] + - - [768, 64, 1, 768] + - [353, 1850.43] + - - [768, 1280, 1, 768] + - [356, 8738.13] + - - [30522, 320, 1, 768] + - [357, 9733.59] + - - [128, 128, 96, 64] + - [346, 5470.83] + - - [2, 16, 1, 768] + - [349, 2.47742] + - - [30522, 1280, 1, 768] + - [355, 10127.9] + - - [30522, 640, 1, 768] + - [356, 9987.61] + - - [2, 8, 1, 768] + - [348, 0.96] + - - [768, 4096, 1, 3072] + - [358, 9479.41] + - - [768, 32, 1, 768] + - [352, 880.334] + - - [2, 64, 1, 768] + - [349, 9.99024] + - - [256, 256, 96, 64] + - [343, 7614.47] + - - [64, 64, 768, 64] + - [345, 5354.43] + - - [30522, 160, 1, 768] + - [354, 7740.11] + - - [768, 320, 1, 768] + - [347, 5423.67] + - - [128, 128, 384, 64] + - [344, 7179.98] + - - [768, 16, 1, 768] + - [350, 706.376] + - - [3072, 4096, 1, 768] + - [359, 9961.74] + - - [2048, 512, 1, 100] + - [361, 5180.71] + - - [1024, 200, 1, 560] + - [362, 4061.19] + - - [256, 1280, 1, 1024] + - [369, 4337.44] + - - [256, 44505, 1, 1024] + - [405, 8597.69] + - - [10240, 8976, 1, 256] + - [408, 9471.43] + - - [256, 7168, 1, 1024] + - [399, 6718.56] + - - [8448, 8976, 1, 256] + - [391, 9601.31] + - - [18944, 8976, 1, 256] + - [400, 9666.26] + - - [256, 19200, 1, 1024] + - [376, 7488.94] + - - [5632, 8976, 1, 256] + - [388, 9358.39] + - - [256, 23552, 1, 1024] + - [403, 7980.89] + - - [256, 6656, 1, 1024] + - [403, 6287.22] + - - [256, 14336, 1, 1024] + - [398, 7049.26] + - - [256, 12544, 1, 1024] + - [376, 6728.47] + - - [2048, 684, 1, 768] + - [393, 8479.18] + - - [5376, 8976, 1, 256] + - [388, 9519.51] + - - [256, 5888, 1, 1024] + - [408, 6012.4] + - - [19968, 8976, 1, 256] + - [400, 9684.67] + - - [3840, 8976, 1, 256] + - [385, 9461.89] + - - [4608, 8976, 1, 256] + - [385, 9305.82] + - - [256, 684, 1, 1024] + - [411, 3513.06] + - - [256, 22016, 1, 1024] + - [376, 7643.79] + - - [256, 23296, 1, 1024] + - [405, 8048.12] + - - [4864, 8976, 1, 256] + - [383, 9545.62] + - - [256, 7424, 1, 1024] + - [401, 6770.65] + - - [18176, 8976, 1, 256] + - [408, 9729.47] + - - [256, 15104, 1, 1024] + - [397, 7289.08] + - - [8192, 8976, 1, 256] + - [400, 9395.49] + - - [256, 16128, 1, 1024] + - [400, 7461.28] + - - [13312, 8976, 1, 256] + - [408, 9550.97] + - - [256, 21504, 1, 1024] + - [405, 7635.93] + - - [6400, 8976, 1, 256] + - [392, 9560.96] + - - [256, 8960, 1, 1024] + - [367, 6292.36] + - - [1792, 8976, 1, 256] + - [382, 9372.18] + - - [13824, 8976, 1, 256] + - [400, 9585.27] + - - [11776, 8976, 1, 256] + - [400, 9560.34] + - - [256, 20992, 1, 1024] + - [398, 7490.65] + - - [20480, 8976, 1, 256] + - [408, 9610.7] + - - [5888, 8976, 1, 256] + - [379, 9565.2] + - - [256, 10496, 1, 1024] + - [370, 6631.96] + - - [21248, 8976, 1, 256] + - [400, 9755.77] + - - [5120, 8976, 1, 256] + - [408, 9244.59] + - - [7168, 8976, 1, 256] + - [400, 9388.42] + - - [2048, 1536, 1, 768] + - [389, 9446.04] + - - [256, 8192, 1, 1024] + - [394, 6948.89] + - - [4096, 8976, 1, 256] + - [399, 9115.94] + - - [3328, 8976, 1, 256] + - [392, 9434.55] + - - [1280, 8976, 1, 256] + - [390, 9129.8] + - - [2560, 8976, 1, 256] + - [387, 9199.48] + - - [3072, 8976, 1, 256] + - [402, 8963.6] + - - [256, 11776, 1, 1024] + - [380, 6869.8] + - - [18688, 8976, 1, 256] + - [408, 9726.21] + - - [15104, 8976, 1, 256] + - [408, 9715.71] + - - [23552, 8976, 1, 256] + - [400, 9648.42] + - - [6144, 8976, 1, 256] + - [408, 9339.8] + - - [12544, 8976, 1, 256] + - [408, 9654.45] + - - [256, 11264, 1, 1024] + - [381, 6814.98] + - - [2048, 114, 1, 512] + - [412, 4583.5] + - - [4352, 8976, 1, 256] + - [392, 9471.4] + - - [15360, 8976, 1, 256] + - [408, 9583.77] + - - [256, 31488, 1, 1024] + - [407, 8438.01] + - - [28672, 8976, 1, 256] + - [400, 9688.85] + - - [256, 18176, 1, 1024] + - [376, 7405.09] + - - [9728, 8976, 1, 256] + - [408, 9524.15] + - - [256, 2816, 1, 1024] + - [372, 5405.66] + - - [256, 18944, 1, 1024] + - [376, 7503.41] + - - [256, 3584, 1, 1024] + - [375, 6107.15] + - - [7936, 8976, 1, 256] + - [388, 9608.31] + - - [19712, 8976, 1, 256] + - [408, 9736.25] + - - [256, 14848, 1, 1024] + - [381, 7163.42] + - - [256, 8448, 1, 1024] + - [381, 6372.56] + - - [256, 6400, 1, 1024] + - [395, 6395.71] + - - [256, 6144, 1, 1024] + - [406, 6490.22] + - - [9472, 8976, 1, 256] + - [385, 9609.92] + - - [256, 9984, 1, 1024] + - [368, 6484.75] + - - [684, 8976, 1, 256] + - [377, 8128.53] + - - [20992, 8976, 1, 256] + - [400, 9689.65] + - - [2048, 684, 1, 512] + - [384, 7241.78] + - - [2048, 114, 1, 768] + - [410, 4872.46] + - - [8960, 8976, 1, 256] + - [383, 9603.35] + - - [2048, 1536, 1, 512] + - [386, 8830.11] + - - [256, 3328, 1, 1024] + - [374, 5612.55] + - - [33536, 8976, 1, 256] + - [400, 9797.71] + - - [2048, 8976, 1, 256] + - [400, 8975.46] + - - [10496, 8976, 1, 256] + - [391, 9654.43] + - - [256, 5376, 1, 1024] + - [409, 5626.34] + - - [256, 21248, 1, 1024] + - [378, 7525.45] + - - [256, 13312, 1, 1024] + - [376, 6767.11] + - - [16128, 8976, 1, 256] + - [400, 9715.57] + - - [2304, 8976, 1, 256] + - [373, 9433.83] + - - [256, 4864, 1, 1024] + - [363, 5743.55] + - - [17152, 8976, 1, 256] + - [408, 9708.94] + - - [15872, 8976, 1, 256] + - [408, 9657.57] + - - [9984, 8976, 1, 256] + - [385, 9639.74] + - - [256, 14592, 1, 1024] + - [397, 7223.92] + - - [256, 33536, 1, 1024] + - [404, 8147.31] + - - [11264, 8976, 1, 256] + - [400, 9509.96] + - - [31488, 8976, 1, 256] + - [408, 9799.31] + - - [256, 20480, 1, 1024] + - [381, 7498.2] + - - [44505, 8976, 1, 256] + - [392, 9804.78] + - - [13568, 8976, 1, 256] + - [400, 9680.24] + - - [256, 11520, 1, 1024] + - [380, 6805.26] + - - [256, 7936, 1, 1024] + - [396, 6971.77] + - - [2048, 256, 1, 768] + - [366, 7129.13] + - - [256, 4608, 1, 1024] + - [364, 5462.91] + - - [256, 2304, 1, 1024] + - [371, 4842.69] + - - [256, 2560, 1, 1024] + - [372, 5309.25] + - - [2816, 8976, 1, 256] + - [383, 9409.56] - - [1024, 128, 1, 128] - - [353, 896.319] + - [425, 896.319] - - [4, 704, 1, 1280] - - [390, 328.976] + - [462, 328.976] - - [4, 1856, 1, 3328] - - [400, 501.461] + - [472, 501.461] - - [1856, 448, 1, 3328] - - [445, 5678.01] + - [517, 5678.01] - - [2944, 4288, 1, 1280] - - [431, 8412.49] + - [503, 8412.49] - - [2368, 64, 1, 3328] - - [381, 4914.02] + - [453, 4914.02] - - [1760, 32, 1, 1760] - - [408, 3313.04] + - [480, 3313.04] - - [2368, 5888, 1, 256] - - [431, 6489.82] + - [503, 6489.82] - - [5888, 1856, 1, 256] - - [443, 7791.98] + - [515, 7791.98] - - [128, 64, 1, 256] - - [415, 369.317] + - [487, 369.317] - - [512, 24000, 1, 1536] - - [437, 8827.47] + - [509, 8827.47] - - [128, 6784, 1, 3328] - - [437, 6537.09] + - [509, 6537.09] - - [5888, 1408, 1, 256] - - [451, 6129.71] + - [523, 6129.71] - - [5888, 1856, 1, 3328] - - [437, 7969.27] + - [509, 7969.27] - - [5056, 704, 1, 256] - - [437, 6723.92] + - [509, 6723.92] - - [2048, 400, 1, 512] - - [443, 4531.54] + - [515, 4531.54] - - [5888, 2944, 1, 3328] - - [443, 8608.14] + - [515, 8608.14] - - [1856, 4288, 1, 256] - - [443, 6297.64] + - [515, 6297.64] - - [1024, 5056, 1, 128] - - [421, 3595.47] + - [493, 3595.47] - - [5056, 5056, 1, 3328] - - [437, 8559.26] + - [509, 8559.26] - - [1408, 5888, 1, 1280] - - [432, 6797.16] + - [504, 6797.16] - - [2368, 448, 1, 128] - - [421, 2815.0] + - [493, 2815.0] - - [2368, 6784, 1, 128] - - [425, 4782.08] + - [497, 4782.08] - - [1024, 3584, 1, 3328] - - [433, 8402.54] + - [505, 8402.54] - - [512, 48000, 1, 2048] - - [437, 8162.33] + - [509, 8162.33] - - [128, 448, 1, 1280] - - [408, 2903.59] + - [480, 2903.59] - - [256, 4288, 1, 3328] - - [438, 6346.04] + - [510, 6346.04] - - [5888, 1408, 1, 1280] - - [437, 8959.55] + - [509, 8959.55] - - [704, 1856, 1, 3328] - - [432, 6955.37] + - [504, 6955.37] - - [4, 1408, 1, 128] - - [452, 60.1747] + - [524, 60.1747] - - [1024, 2368, 1, 256] - - [439, 5927.88] + - [511, 5927.88] - - [64, 4, 1, 256] - - [457, 13.3129] + - [529, 13.3129] - - [1408, 1856, 1, 1280] - - [435, 8051.68] + - [507, 8051.68] - - [1408, 64, 1, 1280] - - [411, 3400.55] + - [483, 3400.55] - - [448, 1024, 1, 1280] - - [439, 5730.02] + - [511, 5730.02] - - [6144, 24000, 1, 2048] - - [443, 7738.4] + - [515, 7738.4] - - [4096, 32, 1, 4096] - - [381, 2381.53] + - [453, 2381.53] - - [256, 1408, 1, 3328] - - [439, 4844.88] + - [511, 4844.88] - - [5056, 5056, 1, 1280] - - [443, 9090.2] + - [515, 9090.2] - - [448, 5056, 1, 256] - - [449, 4961.28] + - [521, 4961.28] - - [704, 1856, 1, 1280] - - [435, 6456.54] + - [507, 6456.54] - - [128, 5056, 1, 128] - - [364, 2251.12] + - [436, 2251.12] - - [2368, 128, 1, 256] - - [432, 3403.37] + - [504, 3403.37] - - [1760, 6400, 1, 1760] - - [431, 8959.8] + - [503, 8959.8] - - [1856, 1408, 1, 128] - - [424, 3493.16] + - [496, 3493.16] - - [64, 5056, 1, 256] - - [433, 2582.32] + - [505, 2582.32] - - [6784, 256, 1, 3328] - - [431, 7323.64] + - [503, 7323.64] - - [6784, 4288, 1, 3328] - - [433, 8542.19] + - [505, 8542.19] - - [4288, 448, 1, 256] - - [449, 5030.6] + - [521, 5030.6] - - [64, 704, 1, 128] - - [366, 375.567] + - [438, 375.567] - - [1856, 2368, 1, 3328] - - [442, 6742.44] + - [514, 6742.44] - - [4288, 2944, 1, 1280] - - [443, 8578.27] + - [515, 8578.27] - - [704, 5056, 1, 1280] - - [439, 8014.55] + - [511, 8014.55] - - [2368, 704, 1, 3328] - - [438, 6544.41] + - [510, 6544.41] - - [256, 5888, 1, 256] - - [436, 5933.0] + - [508, 5933.0] - - [1856, 4288, 1, 3328] - - [442, 7410.82] + - [514, 7410.82] - - [256, 2944, 1, 256] - - [438, 5014.08] + - [510, 5014.08] - - [5888, 1024, 1, 256] - - [443, 8069.44] + - [515, 8069.44] - - [448, 64, 1, 1280] - - [418, 2057.28] + - [490, 2057.28] - - [3072, 64, 1, 1024] - - [398, 2145.52] + - [470, 2145.52] - - [3584, 4, 1, 1280] - - [390, 498.743] + - [462, 498.743] - - [16384, 3200, 1, 4096] - - [430, 6621.53] + - [502, 6621.53] - - [2944, 64, 1, 256] - - [438, 2554.89] + - [510, 2554.89] - - [128, 4, 1, 1280] - - [400, 87.2489] + - [472, 87.2489] - - [1408, 2944, 1, 256] - - [437, 8029.45] + - [509, 8029.45] - - [256, 1856, 1, 1280] - - [432, 6170.7] + - [504, 6170.7] - - [6784, 5056, 1, 3328] - - [441, 7134.29] + - [513, 7134.29] - - [5056, 5056, 1, 256] - - [449, 6246.9] + - [521, 6246.9] - - [1408, 6784, 1, 128] - - [426, 4329.55] + - [498, 4329.55] - - [64, 1024, 1, 1280] - - [408, 3206.75] + - [480, 3206.75] - - [2944, 4, 1, 256] - - [457, 333.58] + - [529, 333.58] - - [704, 5056, 1, 128] - - [421, 4085.52] + - [493, 4085.52] - - [4, 2368, 1, 1280] - - [458, 394.767] + - [530, 394.767] - - [2368, 2944, 1, 1280] - - [437, 8634.05] + - [509, 8634.05] - - [128, 3584, 1, 1280] - - [438, 6046.25] + - [510, 6046.25] - - [6784, 6784, 1, 1280] - - [443, 8847.51] + - [515, 8847.51] - - [1408, 4288, 1, 1280] - - [443, 8236.79] + - [515, 8236.79] - - [3584, 4288, 1, 1280] - - [438, 7399.98] + - [510, 7399.98] - - [2368, 704, 1, 1280] - - [431, 6754.5] + - [503, 6754.5] - - [5056, 4288, 1, 3328] - - [437, 8569.63] + - [509, 8569.63] - - [3584, 2368, 1, 3328] - - [442, 7942.48] + - [514, 7942.48] - - [64, 704, 1, 1280] - - [411, 2363.69] + - [483, 2363.69] - - [4288, 256, 1, 256] - - [439, 4591.9] + - [511, 4591.9] - - [2944, 128, 1, 128] - - [364, 1878.39] + - [436, 1878.39] - - [6144, 32, 1, 2560] - - [409, 3334.2] + - [481, 3334.2] - - [6784, 448, 1, 1280] - - [441, 7939.3] + - [513, 7939.3] - - [1408, 2944, 1, 128] - - [425, 4096.61] + - [497, 4096.61] - - [4288, 2944, 1, 256] - - [431, 8141.23] + - [503, 8141.23] - - [5888, 704, 1, 1280] - - [432, 7516.23] + - [504, 7516.23] - - [5056, 4, 1, 3328] - - [375, 552.509] + - [447, 552.509] - - [1856, 64, 1, 1280] - - [381, 3870.86] + - [453, 3870.86] - - [1760, 16, 1, 1760] - - [393, 2181.51] + - [465, 2181.51] - - [448, 5888, 1, 128] - - [426, 3371.1] + - [498, 3371.1] - - [5888, 64, 1, 3328] - - [406, 5319.48] + - [478, 5319.48] - - [2944, 256, 1, 3328] - - [438, 7122.4] + - [510, 7122.4] - - [1024, 64, 1, 128] - - [353, 595.882] + - [425, 595.882] - - [5056, 2368, 1, 1280] - - [432, 7778.29] + - [504, 7778.29] - - [448, 3584, 1, 1280] - - [437, 6500.62] + - [509, 6500.62] - - [6784, 5888, 1, 256] - - [437, 8918.68] + - [509, 8918.68] - - [704, 1024, 1, 128] - - [421, 2627.51] + - [493, 2627.51] - - [704, 128, 1, 1280] - - [408, 3408.59] + - [480, 3408.59] - - [4, 3584, 1, 128] - - [452, 140.821] + - [524, 140.821] - - [1408, 448, 1, 1280] - - [432, 5881.54] + - [504, 5881.54] - - [1024, 1408, 1, 256] - - [436, 5647.27] + - [508, 5647.27] - - [2368, 2368, 1, 3328] - - [430, 7688.83] + - [502, 7688.83] - - [1856, 6784, 1, 128] - - [421, 4705.95] + - [493, 4705.95] - - [5056, 704, 1, 3328] - - [441, 8198.98] + - [513, 8198.98] - - [1408, 1856, 1, 256] - - [443, 6340.05] + - [515, 6340.05] - - [1408, 704, 1, 3328] - - [435, 7599.65] + - [507, 7599.65] - - [2368, 5056, 1, 256] - - [443, 8242.85] + - [515, 8242.85] - - [1408, 256, 1, 1280] - - [438, 4879.26] + - [510, 4879.26] - - [3072, 128, 1, 1024] - - [407, 2525.52] + - [479, 2525.52] - - [3584, 2368, 1, 1280] - - [439, 8132.72] + - [511, 8132.72] - - [4288, 64, 1, 3328] - - [394, 5156.53] + - [466, 5156.53] - - [2368, 4, 1, 1280] - - [456, 482.75] + - [528, 482.75] - - [704, 5888, 1, 256] - - [446, 5398.75] + - [518, 5398.75] - - [6784, 2944, 1, 128] - - [422, 4748.99] + - [494, 4748.99] - - [2560, 1600, 1, 2560] - - [433, 7355.0] + - [505, 7355.0] - - [4288, 6784, 1, 3328] - - [430, 7409.41] + - [502, 7409.41] - - [2944, 256, 1, 256] - - [438, 5077.42] + - [510, 5077.42] - - [2944, 6784, 1, 3328] - - [443, 8068.05] + - [515, 8068.05] - - [704, 1408, 1, 3328] - - [438, 7239.43] + - [510, 7239.43] - - [6144, 5984, 1, 2048] - - [437, 7176.07] + - [509, 7176.07] - - [3584, 704, 1, 3328] - - [443, 6642.86] + - [515, 6642.86] - - [2944, 256, 1, 128] - - [422, 2644.54] + - [494, 2644.54] - - [6784, 4, 1, 1280] - - [454, 402.487] + - [526, 402.487] - - [1024, 64, 1, 1280] - - [408, 2602.03] + - [480, 2602.03] - - [2048, 1600, 1, 512] - - [435, 5592.5] + - [507, 5592.5] - - [448, 4288, 1, 256] - - [433, 6128.99] + - [505, 6128.99] - - [64, 3584, 1, 3328] - - [374, 5534.93] + - [446, 5534.93] - - [1856, 4288, 1, 128] - - [424, 4400.11] + - [496, 4400.11] - - [704, 2368, 1, 1280] - - [449, 5735.02] + - [521, 5735.02] - - [1856, 2368, 1, 1280] - - [446, 6482.4] + - [518, 6482.4] - - [2368, 128, 1, 3328] - - [419, 4717.32] + - [491, 4717.32] - - [2944, 128, 1, 256] - - [446, 3276.9] + - [518, 3276.9] - - [448, 1408, 1, 256] - - [438, 4852.28] + - [510, 4852.28] - - [1856, 4288, 1, 1280] - - [433, 8132.96] + - [505, 8132.96] - - [64, 5056, 1, 3328] - - [409, 5097.06] + - [481, 5097.06] - - [4, 704, 1, 256] - - [456, 128.831] + - [528, 128.831] - - [1024, 448, 1, 128] - - [421, 1816.94] + - [493, 1816.94] - - [704, 4, 1, 1280] - - [457, 328.976] + - [529, 328.976] - - [704, 256, 1, 128] - - [425, 876.569] + - [497, 876.569] - - [704, 2944, 1, 128] - - [425, 3734.47] + - [497, 3734.47] - - [1408, 1024, 1, 1280] - - [433, 7224.85] + - [505, 7224.85] - - [704, 6784, 1, 256] - - [437, 7354.77] + - [509, 7354.77] - - [6784, 704, 1, 256] - - [433, 6012.28] + - [505, 6012.28] - - [5056, 1408, 1, 128] - - [426, 4311.28] + - [498, 4311.28] - - [2048, 7000, 1, 2048] - - [437, 7232.07] + - [509, 7232.07] - - [256, 3584, 1, 3328] - - [441, 7006.0] + - [513, 7006.0] - - [4, 5888, 1, 3328] - - [459, 534.612] + - [531, 534.612] - - [128, 1408, 1, 128] - - [351, 1177.07] + - [423, 1177.07] - - [3584, 4288, 1, 3328] - - [443, 7135.0] + - [515, 7135.0] - - [5888, 1856, 1, 1280] - - [431, 8395.03] + - [503, 8395.03] - - [256, 1408, 1, 256] - - [432, 3977.46] + - [504, 3977.46] - - [5056, 64, 1, 1280] - - [432, 4257.78] + - [504, 4257.78] - - [1024, 704, 1, 256] - - [432, 5036.93] + - [504, 5036.93] - - [448, 128, 1, 128] - - [353, 533.533] + - [425, 533.533] - - [2368, 3584, 1, 1280] - - [437, 8272.43] + - [509, 8272.43] - - [2368, 6784, 1, 1280] - - [430, 8288.24] + - [502, 8288.24] - - [1856, 4, 1, 1280] - - [370, 464.1] + - [442, 464.1] - - [448, 448, 1, 256] - - [432, 3058.45] + - [504, 3058.45] - - [2944, 3584, 1, 3328] - - [437, 8557.63] + - [509, 8557.63] - - [7680, 32, 1, 2560] - - [409, 3729.03] + - [481, 3729.03] - - [128, 4288, 1, 128] - - [352, 2116.2] + - [424, 2116.2] - - [256, 256, 1, 3328] - - [408, 4051.06] + - [480, 4051.06] - - [128, 1024, 1, 3328] - - [381, 5139.21] + - [453, 5139.21] - - [4, 1408, 1, 3328] - - [400, 502.871] + - [472, 502.871] - - [6784, 2944, 1, 256] - - [431, 8446.06] + - [503, 8446.06] - - [64, 1856, 1, 1280] - - [373, 3870.86] + - [445, 3870.86] - - [6784, 64, 1, 128] - - [421, 1877.62] + - [493, 1877.62] - - [4288, 2368, 1, 3328] - - [441, 8419.4] + - [513, 8419.4] - - [1856, 2368, 1, 256] - - [435, 6887.48] + - [507, 6887.48] - - [3584, 256, 1, 128] - - [425, 2496.71] + - [497, 2496.71] - - [3584, 6784, 1, 3328] - - [437, 7626.18] + - [509, 7626.18] - - [256, 1024, 1, 256] - - [438, 3095.53] + - [510, 3095.53] - - [4, 6784, 1, 3328] - - [400, 589.274] + - [472, 589.274] - - [1024, 5888, 1, 3328] - - [437, 7794.35] + - [509, 7794.35] - - [1024, 128, 1, 1280] - - [410, 3130.18] + - [482, 3130.18] - - [3072, 32, 1, 1024] - - [397, 1675.59] + - [469, 1675.59] - - [6144, 24000, 1, 2560] - - [437, 7256.14] + - [509, 7256.14] - - [5056, 4288, 1, 1280] - - [435, 8349.03] + - [507, 8349.03] - - [5888, 64, 1, 256] - - [384, 2593.35] + - [456, 2593.35] - - [6784, 1856, 1, 3328] - - [431, 8087.38] + - [503, 8087.38] - - [1408, 5056, 1, 1280] - - [433, 7802.63] + - [505, 7802.63] - - [1856, 256, 1, 1280] - - [438, 6150.73] + - [510, 6150.73] - - [64, 5888, 1, 3328] - - [405, 5301.49] + - [477, 5301.49] - - [2368, 2368, 1, 1280] - - [435, 8233.43] + - [507, 8233.43] - - [2944, 5888, 1, 128] - - [428, 3745.51] + - [500, 3745.51] - - [704, 5888, 1, 1280] - - [433, 8245.04] + - [505, 8245.04] - - [2368, 3584, 1, 128] - - [425, 4523.43] + - [497, 4523.43] - - [1856, 5056, 1, 128] - - [422, 4498.08] + - [494, 4498.08] - - [704, 1024, 1, 1280] - - [446, 5479.59] + - [518, 5479.59] - - [448, 256, 1, 3328] - - [389, 5048.8] + - [461, 5048.8] - - [448, 1856, 1, 128] - - [422, 2936.92] + - [494, 2936.92] - - [8192, 3200, 1, 2048] - - [431, 6713.12] + - [503, 6713.12] - - [128, 1024, 1, 128] - - [367, 998.744] + - [439, 998.744] - - [2944, 4, 1, 128] - - [452, 98.7471] + - [524, 98.7471] - - [1024, 704, 1, 1280] - - [438, 5897.0] + - [510, 5897.0] - - [128, 5888, 1, 256] - - [438, 5014.08] + - [510, 5014.08] - - [1024, 5056, 1, 1280] - - [437, 8857.81] + - [509, 8857.81] - - [4288, 1024, 1, 256] - - [443, 6195.39] + - [515, 6195.39] - - [2944, 2368, 1, 128] - - [421, 4442.23] + - [493, 4442.23] - - [704, 704, 1, 3328] - - [438, 6764.4] + - [510, 6764.4] - - [704, 1408, 1, 1280] - - [439, 7383.58] + - [511, 7383.58] - - [5888, 448, 1, 1280] - - [437, 7299.49] + - [509, 7299.49] - - [3584, 256, 1, 3328] - - [435, 7061.72] + - [507, 7061.72] - - [704, 5888, 1, 3328] - - [439, 8142.42] + - [511, 8142.42] - - [704, 1856, 1, 128] - - [425, 3139.14] + - [497, 3139.14] - - [448, 448, 1, 3328] - - [403, 5063.34] + - [475, 5063.34] - - [4, 4288, 1, 128] - - [453, 64.9775] + - [525, 64.9775] - - [128, 704, 1, 1280] - - [373, 3400.55] + - [445, 3400.55] - - [3584, 2944, 1, 256] - - [443, 7982.14] + - [515, 7982.14] - - [3584, 4, 1, 128] - - [452, 105.318] + - [524, 105.318] - - [1856, 128, 1, 3328] - - [404, 5442.19] + - [476, 5442.19] - - [4, 64, 1, 1280] - - [458, 42.3268] + - [530, 42.3268] - - [2944, 448, 1, 128] - - [421, 2926.95] + - [493, 2926.95] - - [128, 2944, 1, 1280] - - [432, 5109.69] + - [504, 5109.69] - - [64, 64, 1, 3328] - - [400, 1252.99] + - [472, 1252.99] - - [448, 2944, 1, 1280] - - [441, 6684.47] + - [513, 6684.47] - - [512, 24000, 1, 2048] - - [437, 7939.03] + - [509, 7939.03] - - [128, 256, 1, 3328] - - [418, 3276.9] + - [490, 3276.9] - - [1408, 5056, 1, 3328] - - [443, 8959.21] + - [515, 8959.21] - - [1856, 1856, 1, 3328] - - [433, 8006.17] + - [505, 8006.17] - - [3584, 128, 1, 256] - - [438, 4292.52] + - [510, 4292.52] - - [2560, 800, 1, 2560] - - [433, 6262.48] + - [505, 6262.48] - - [448, 1408, 1, 3328] - - [449, 4997.35] + - [521, 4997.35] - - [2368, 2368, 1, 256] - - [451, 4978.94] + - [523, 4978.94] - - [4288, 4288, 1, 1280] - - [430, 8617.78] + - [502, 8617.78] - - [64, 448, 1, 1280] - - [376, 2057.28] + - [448, 2057.28] - - [5888, 1024, 1, 1280] - - [448, 6848.17] + - [520, 6848.17] - - [1408, 4288, 1, 256] - - [431, 7077.01] + - [503, 7077.01] - - [448, 4, 1, 256] - - [456, 84.4294] + - [528, 84.4294] - - [5888, 448, 1, 128] - - [425, 3493.91] + - [497, 3493.91] - - [512, 48000, 1, 2560] - - [443, 8960.13] + - [515, 8960.13] - - [35, 8457, 1, 1760] - - [345, 3934.78] + - [417, 3934.78] - - [704, 6784, 1, 3328] - - [430, 8180.88] + - [502, 8180.88] - - [2560, 6400, 1, 2560] - - [431, 7822.24] + - [503, 7822.24] - - [5056, 1024, 1, 1280] - - [433, 8357.38] + - [505, 8357.38] - - [448, 5888, 1, 3328] - - [437, 7505.28] + - [509, 7505.28] - - [128, 4, 1, 128] - - [452, 0.662251] + - [524, 0.662251] - - [1024, 2944, 1, 1280] - - [437, 8406.24] + - [509, 8406.24] - - [5056, 5888, 1, 1280] - - [437, 8819.76] + - [509, 8819.76] - - [4288, 5888, 1, 128] - - [422, 3522.32] + - [494, 3522.32] - - [256, 3584, 1, 256] - - [433, 5883.89] + - [505, 5883.89] - - [1408, 3584, 1, 128] - - [421, 4283.41] + - [493, 4283.41] - - [256, 2944, 1, 3328] - - [441, 5670.63] + - [513, 5670.63] - - [448, 3584, 1, 128] - - [425, 3171.72] + - [497, 3171.72] - - [5888, 2944, 1, 1280] - - [443, 8198.86] + - [515, 8198.86] - - [4, 6784, 1, 1280] - - [390, 553.896] + - [462, 553.896] - - [2368, 5888, 1, 128] - - [421, 4787.32] + - [493, 4787.32] - - [8448, 16, 1, 2816] - - [380, 2452.63] + - [452, 2452.63] - - [64, 2944, 1, 128] - - [353, 1376.66] + - [425, 1376.66] - - [2368, 4, 1, 256] - - [375, 278.177] + - [447, 278.177] - - [3584, 5888, 1, 256] - - [451, 6233.66] + - [523, 6233.66] - - [2368, 1024, 1, 128] - - [422, 3781.51] + - [494, 3781.51] - - [2368, 704, 1, 128] - - [422, 3198.32] + - [494, 3198.32] - - [3584, 2944, 1, 1280] - - [433, 8045.68] + - [505, 8045.68] - - [3584, 2368, 1, 128] - - [422, 4188.57] + - [494, 4188.57] - - [5056, 704, 1, 128] - - [425, 4019.21] + - [497, 4019.21] - - [448, 2368, 1, 128] - - [427, 2522.21] + - [499, 2522.21] - - [5056, 1408, 1, 3328] - - [435, 8349.93] + - [507, 8349.93] - - [1408, 704, 1, 256] - - [441, 4741.42] + - [513, 4741.42] - - [6784, 1024, 1, 3328] - - [443, 8769.5] + - [515, 8769.5] - - [6784, 2944, 1, 3328] - - [440, 7319.74] + - [512, 7319.74] - - [2944, 5056, 1, 3328] - - [430, 8889.76] + - [502, 8889.76] - - [1856, 1856, 1, 256] - - [433, 6309.84] + - [505, 6309.84] - - [1024, 5888, 1, 128] - - [424, 3759.6] + - [496, 3759.6] - - [6784, 2368, 1, 1280] - - [433, 8298.4] + - [505, 8298.4] - - [256, 4, 1, 128] - - [452, 7.10171] + - [524, 7.10171] - - [4288, 5888, 1, 1280] - - [437, 8365.28] + - [509, 8365.28] - - [4288, 4288, 1, 256] - - [437, 6513.78] + - [509, 6513.78] - - [8448, 32, 1, 2816] - - [408, 4257.74] + - [480, 4257.74] - - [448, 2944, 1, 3328] - - [441, 6875.62] + - [513, 6875.62] - - [5888, 4, 1, 128] - - [452, 163.94] + - [524, 163.94] - - [4288, 1856, 1, 1280] - - [437, 8402.91] + - [509, 8402.91] - - [1856, 2944, 1, 3328] - - [437, 6612.21] + - [509, 6612.21] - - [256, 6784, 1, 3328] - - [438, 7358.7] + - [510, 7358.7] - - [64, 5888, 1, 256] - - [432, 3359.05] + - [504, 3359.05] - - [256, 5056, 1, 128] - - [425, 2489.21] + - [497, 2489.21] - - [5056, 1024, 1, 256] - - [443, 8077.87] + - [515, 8077.87] - - [704, 64, 1, 3328] - - [387, 3288.4] + - [459, 3288.4] - - [5056, 1856, 1, 3328] - - [441, 8171.13] + - [513, 8171.13] - - [4, 2944, 1, 3328] - - [400, 546.843] + - [472, 546.843] - - [4, 5056, 1, 256] - - [375, 378.561] + - [447, 378.561] - - [1856, 1408, 1, 256] - - [443, 6320.88] + - [515, 6320.88] - - [8448, 12000, 1, 2816] - - [441, 7365.87] + - [513, 7365.87] - - [6784, 128, 1, 3328] - - [438, 6366.57] + - [510, 6366.57] - - [4288, 1408, 1, 128] - - [421, 4451.7] + - [493, 4451.7] - - [1856, 5888, 1, 3328] - - [439, 8619.76] + - [511, 8619.76] - - [4288, 5056, 1, 256] - - [443, 7289.05] + - [515, 7289.05] - - [1408, 128, 1, 1280] - - [381, 4291.15] + - [453, 4291.15] - - [4096, 800, 1, 1024] - - [432, 5867.89] + - [504, 5867.89] - - [5056, 256, 1, 3328] - - [438, 7527.61] + - [510, 7527.61] - - [704, 704, 1, 256] - - [438, 4417.85] + - [510, 4417.85] - - [1024, 5888, 1, 1280] - - [443, 8674.57] + - [515, 8674.57] - - [6784, 2368, 1, 128] - - [421, 4724.08] + - [493, 4724.08] - - [4, 5056, 1, 1280] - - [390, 540.307] + - [462, 540.307] - - [256, 64, 1, 1280] - - [392, 1515.38] + - [464, 1515.38] - - [128, 1856, 1, 1280] - - [432, 4574.21] + - [504, 4574.21] - - [1856, 1024, 1, 1280] - - [437, 7741.61] + - [509, 7741.61] - - [6784, 4288, 1, 1280] - - [443, 8521.29] + - [515, 8521.29] - - [2560, 64, 1, 2560] - - [374, 3504.7] + - [446, 3504.7] - - [1856, 1856, 1, 1280] - - [433, 7779.31] + - [505, 7779.31] - - [4096, 400, 1, 1024] - - [443, 4157.81] + - [515, 4157.81] - - [3072, 24000, 1, 1024] - - [443, 8663.45] + - [515, 8663.45] - - [128, 4288, 1, 3328] - - [389, 5674.23] + - [461, 5674.23] - - [4, 2368, 1, 3328] - - [400, 525.48] + - [472, 525.48] - - [5888, 1856, 1, 128] - - [425, 4099.74] + - [497, 4099.74] - - [448, 704, 1, 1280] - - [438, 4309.47] + - [510, 4309.47] - - [128, 5056, 1, 1280] - - [381, 5068.46] + - [453, 5068.46] - - [1024, 448, 1, 3328] - - [441, 6077.82] + - [513, 6077.82] - - [1856, 704, 1, 1280] - - [449, 6257.49] + - [521, 6257.49] - - [5056, 3584, 1, 128] - - [422, 4598.52] + - [494, 4598.52] - - [5888, 5888, 1, 3328] - - [443, 8058.25] + - [515, 8058.25] - - [6784, 1024, 1, 256] - - [443, 5120.99] + - [515, 5120.99] - - [2944, 2368, 1, 256] - - [434, 6523.03] + - [506, 6523.03] - - [256, 448, 1, 256] - - [384, 1816.94] + - [456, 1816.94] - - [5056, 5888, 1, 3328] - - [436, 6722.41] + - [508, 6722.41] - - [1856, 1024, 1, 256] - - [443, 6632.31] + - [515, 6632.31] - - [512, 48000, 1, 1536] - - [437, 8556.01] + - [509, 8556.01] - - [3584, 448, 1, 1280] - - [432, 6567.09] + - [504, 6567.09] - - [8448, 5984, 1, 2816] - - [437, 8990.66] + - [509, 8990.66] - - [448, 5888, 1, 256] - - [437, 6220.47] + - [509, 6220.47] - - [704, 64, 1, 128] - - [350, 450.66] + - [422, 450.66] - - [1408, 6784, 1, 3328] - - [430, 8478.68] + - [502, 8478.68] - - [448, 1024, 1, 128] - - [429, 1844.33] + - [501, 1844.33] - - [4288, 704, 1, 128] - - [425, 3895.26] + - [497, 3895.26] - - [128, 1856, 1, 128] - - [356, 1456.46] + - [428, 1456.46] - - [448, 2368, 1, 3328] - - [435, 5538.04] + - [507, 5538.04] - - [5056, 64, 1, 128] - - [421, 1648.94] + - [493, 1648.94] - - [5056, 2944, 1, 256] - - [437, 8230.87] + - [509, 8230.87] - - [6784, 5888, 1, 128] - - [421, 4873.19] + - [493, 4873.19] - - [1024, 700, 1, 512] - - [435, 4445.37] + - [507, 4445.37] - - [704, 1024, 1, 256] - - [433, 4707.99] + - [505, 4707.99] - - [1024, 4, 1, 256] - - [375, 174.863] + - [447, 174.863] - - [2944, 704, 1, 128] - - [425, 3483.42] + - [497, 3483.42] - - [128, 6784, 1, 1280] - - [433, 6522.93] + - [505, 6522.93] - - [1408, 3584, 1, 3328] - - [437, 8673.59] + - [509, 8673.59] - - [2368, 6784, 1, 256] - - [433, 7941.76] + - [505, 7941.76] - - [5056, 1408, 1, 1280] - - [437, 8801.01] + - [509, 8801.01] - - [256, 256, 1, 128] - - [362, 551.982] + - [434, 551.982] - - [5056, 4288, 1, 128] - - [429, 3793.64] + - [501, 3793.64] - - [1408, 1856, 1, 128] - - [421, 3067.74] + - [493, 3067.74] - - [1408, 5888, 1, 3328] - - [437, 9148.97] + - [509, 9148.97] - - [1856, 256, 1, 256] - - [433, 4319.52] + - [505, 4319.52] - - [6784, 6784, 1, 256] - - [433, 7668.53] + - [505, 7668.53] - - [64, 256, 1, 128] - - [367, 131.172] + - [439, 131.172] - - [4288, 2368, 1, 128] - - [422, 4582.99] + - [494, 4582.99] - - [256, 4288, 1, 1280] - - [432, 6058.61] + - [504, 6058.61] - - [2368, 2944, 1, 256] - - [437, 8016.07] + - [509, 8016.07] - - [4, 1856, 1, 256] - - [454, 252.832] + - [526, 252.832] - - [3584, 1856, 1, 1280] - - [433, 7760.24] + - [505, 7760.24] - - [6784, 6784, 1, 128] - - [422, 4970.14] + - [494, 4970.14] - - [256, 1856, 1, 128] - - [428, 1580.59] + - [500, 1580.59] - - [704, 64, 1, 1280] - - [417, 2556.47] + - [489, 2556.47] - - [5888, 5056, 1, 256] - - [437, 8216.67] + - [509, 8216.67] - - [8448, 48000, 1, 2816] - - [443, 4082.89] + - [515, 4082.89] - - [3584, 448, 1, 256] - - [437, 5518.92] + - [509, 5518.92] - - [448, 4288, 1, 128] - - [425, 3415.25] + - [497, 3415.25] - - [7680, 64, 1, 2560] - - [386, 5162.1] + - [458, 5162.1] - - [256, 6784, 1, 256] - - [437, 6272.62] + - [509, 6272.62] - - [1408, 4288, 1, 128] - - [425, 4343.63] + - [497, 4343.63] - - [2944, 704, 1, 3328] - - [432, 7679.71] + - [504, 7679.71] - - [128, 448, 1, 256] - - [372, 1422.59] + - [444, 1422.59] - - [5056, 256, 1, 1280] - - [439, 5052.39] + - [511, 5052.39] - - [2560, 32, 1, 2560] - - [395, 3106.07] + - [467, 3106.07] - - [3584, 3584, 1, 256] - - [443, 8260.57] + - [515, 8260.57] - - [448, 1408, 1, 128] - - [421, 2397.38] + - [493, 2397.38] - - [128, 256, 1, 1280] - - [376, 2340.67] + - [448, 2340.67] - - [3584, 5056, 1, 256] - - [443, 7347.56] + - [515, 7347.56] - - [6784, 128, 1, 256] - - [433, 5591.1] + - [505, 5591.1] - - [4288, 4, 1, 256] - - [375, 354.206] + - [447, 354.206] - - [704, 448, 1, 256] - - [438, 3492.33] + - [510, 3492.33] - - [2944, 2368, 1, 1280] - - [445, 6661.71] + - [517, 6661.71] - - [448, 64, 1, 3328] - - [417, 3058.45] + - [489, 3058.45] - - [1408, 3584, 1, 256] - - [443, 7966.59] + - [515, 7966.59] - - [3584, 4, 1, 3328] - - [456, 605.559] + - [528, 605.559] - - [6784, 3584, 1, 256] - - [433, 7525.41] + - [505, 7525.41] - - [256, 128, 1, 128] - - [365, 276.041] + - [437, 276.041] - - [704, 1408, 1, 128] - - [422, 3109.85] + - [494, 3109.85] - - [4, 2368, 1, 256] - - [456, 283.375] + - [528, 283.375] - - [4288, 128, 1, 1280] - - [438, 5132.65] + - [510, 5132.65] - - [128, 1408, 1, 256] - - [432, 2733.35] + - [504, 2733.35] - - [4, 2944, 1, 256] - - [454, 314.127] + - [526, 314.127] - - [64, 128, 1, 3328] - - [402, 1514.71] + - [474, 1514.71] - - [5056, 2368, 1, 128] - - [426, 3449.17] + - [498, 3449.17] - - [2944, 2944, 1, 3328] - - [430, 8169.03] + - [502, 8169.03] - - [5056, 6784, 1, 256] - - [450, 5792.77] + - [522, 5792.77] - - [1856, 3584, 1, 128] - - [427, 4213.5] + - [499, 4213.5] - - [128, 2944, 1, 128] - - [351, 1970.46] + - [423, 1970.46] - - [35, 8457, 1, 2560] - - [346, 3525.15] + - [418, 3525.15] - - [1024, 704, 1, 3328] - - [432, 6784.99] + - [504, 6784.99] - - [6784, 448, 1, 256] - - [441, 6544.88] + - [513, 6544.88] - - [3584, 6784, 1, 128] - - [421, 4623.6] + - [493, 4623.6] - - [128, 4288, 1, 256] - - [435, 3606.6] + - [507, 3606.6] - - [704, 448, 1, 3328] - - [432, 4478.01] + - [504, 4478.01] - - [128, 128, 1, 3328] - - [417, 2177.65] + - [489, 2177.65] - - [5056, 1856, 1, 256] - - [451, 5608.72] + - [523, 5608.72] - - [4608, 5984, 1, 1536] - - [440, 7859.85] + - [512, 7859.85] - - [256, 128, 1, 256] - - [376, 998.744] + - [448, 998.744] - - [1760, 3200, 1, 1760] - - [433, 8179.64] + - [505, 8179.64] - - [1024, 1856, 1, 256] - - [443, 6143.27] + - [515, 6143.27] - - [4096, 1600, 1, 1024] - - [451, 5851.52] + - [523, 5851.52] - - [4288, 64, 1, 128] - - [356, 1372.26] + - [428, 1372.26] - - [256, 448, 1, 3328] - - [395, 4795.1] + - [467, 4795.1] - - [1408, 6784, 1, 1280] - - [437, 8426.5] + - [509, 8426.5] - - [3584, 3584, 1, 1280] - - [437, 7556.56] + - [509, 7556.56] - - [7680, 24000, 1, 2560] - - [430, 5019.19] + - [502, 5019.19] - - [64, 2368, 1, 1280] - - [381, 4061.8] + - [453, 4061.8] - - [448, 2368, 1, 1280] - - [432, 5928.77] + - [504, 5928.77] - - [4608, 48000, 1, 1536] - - [437, 6937.4] + - [509, 6937.4] - - [5888, 5888, 1, 128] - - [422, 3744.0] + - [494, 3744.0] - - [64, 6784, 1, 3328] - - [432, 5988.72] + - [504, 5988.72] - - [2944, 256, 1, 1280] - - [438, 6717.97] + - [510, 6717.97] - - [2048, 16, 1, 2048] - - [390, 1210.58] + - [462, 1210.58] - - [256, 2368, 1, 128] - - [425, 1936.07] + - [497, 1936.07] - - [5056, 2368, 1, 3328] - - [443, 8875.63] + - [515, 8875.63] - - [2944, 4288, 1, 256] - - [437, 8063.24] + - [509, 8063.24] - - [1408, 3584, 1, 1280] - - [433, 8197.07] + - [505, 8197.07] - - [2368, 64, 1, 256] - - [432, 2365.79] + - [504, 2365.79] - - [64, 448, 1, 3328] - - [418, 3027.4] + - [490, 3027.4] - - [704, 128, 1, 3328] - - [389, 4452.19] + - [461, 4452.19] - - [8192, 1600, 1, 2048] - - [437, 7229.93] + - [509, 7229.93] - - [1856, 704, 1, 256] - - [439, 5545.45] + - [511, 5545.45] - - [4, 4288, 1, 1280] - - [390, 523.825] + - [462, 523.825] - - [1408, 448, 1, 3328] - - [444, 4789.4] + - [516, 4789.4] - - [1024, 4, 1, 3328] - - [370, 504.223] + - [442, 504.223] - - [512, 24000, 1, 2560] - - [443, 8903.62] + - [515, 8903.62] - - [2368, 6784, 1, 3328] - - [443, 8311.14] + - [515, 8311.14] - - [1856, 1408, 1, 1280] - - [433, 8160.11] + - [505, 8160.11] - - [1856, 448, 1, 1280] - - [435, 6243.07] + - [507, 6243.07] - - [6784, 704, 1, 128] - - [421, 4069.05] + - [493, 4069.05] - - [4, 4, 1, 256] - - [390, 0.842029] + - [462, 0.842029] - - [128, 5888, 1, 128] - - [421, 2328.02] + - [493, 2328.02] - - [1408, 5888, 1, 256] - - [432, 6986.91] + - [504, 6986.91] - - [704, 2944, 1, 1280] - - [433, 7905.03] + - [505, 7905.03] - - [4288, 64, 1, 1280] - - [408, 3828.27] + - [480, 3828.27] - - [256, 64, 1, 256] - - [383, 655.46] + - [455, 655.46] - - [704, 1856, 1, 256] - - [441, 5444.37] + - [513, 5444.37] - - [704, 6784, 1, 128] - - [421, 4319.77] + - [493, 4319.77] - - [3584, 704, 1, 1280] - - [441, 7726.43] + - [513, 7726.43] - - [256, 128, 1, 1280] - - [376, 2184.63] + - [448, 2184.63] - - [5888, 2368, 1, 256] - - [443, 8192.69] + - [515, 8192.69] - - [256, 2368, 1, 1280] - - [438, 5675.54] + - [510, 5675.54] - - [2944, 6784, 1, 128] - - [426, 4248.35] + - [498, 4248.35] - - [3584, 448, 1, 3328] - - [437, 6560.77] + - [509, 6560.77] - - [1408, 4, 1, 256] - - [455, 176.79] + - [527, 176.79] - - [704, 2368, 1, 3328] - - [438, 7085.31] + - [510, 7085.31] - - [2944, 448, 1, 256] - - [434, 3412.0] + - [506, 3412.0] - - [1856, 448, 1, 128] - - [422, 2748.82] + - [494, 2748.82] - - [4288, 4, 1, 3328] - - [390, 553.648] + - [462, 553.648] - - [2368, 128, 1, 1280] - - [411, 4173.65] + - [483, 4173.65] - - [256, 5888, 1, 128] - - [426, 2860.98] + - [498, 2860.98] - - [64, 6784, 1, 256] - - [439, 3637.18] + - [511, 3637.18] - - [64, 5056, 1, 1280] - - [438, 4289.53] + - [510, 4289.53] - - [4, 6784, 1, 128] - - [452, 160.906] + - [524, 160.906] - - [2048, 3200, 1, 512] - - [439, 6927.09] + - [511, 6927.09] - - [2944, 2944, 1, 1280] - - [431, 6267.85] + - [503, 6267.85] - - [5056, 448, 1, 3328] - - [432, 7400.36] + - [504, 7400.36] - - [4, 3584, 1, 1280] - - [390, 499.83] + - [462, 499.83] - - [1408, 128, 1, 128] - - [367, 1037.36] + - [439, 1037.36] - - [6784, 704, 1, 3328] - - [438, 7633.95] + - [510, 7633.95] - - [128, 64, 1, 1280] - - [390, 1170.39] + - [462, 1170.39] - - [2368, 256, 1, 1280] - - [438, 5609.89] + - [510, 5609.89] - - [4, 448, 1, 3328] - - [458, 358.5] + - [530, 358.5] - - [5888, 4288, 1, 128] - - [426, 4521.74] + - [498, 4521.74] - - [4, 5888, 1, 256] - - [390, 353.933] + - [462, 353.933] - - [1408, 2944, 1, 3328] - - [431, 8951.41] + - [503, 8951.41] - - [3584, 704, 1, 128] - - [421, 3395.41] + - [493, 3395.41] - - [4608, 12000, 1, 1536] - - [430, 6609.99] + - [502, 6609.99] - - [64, 1024, 1, 256] - - [376, 1588.85] + - [448, 1588.85] - - [5056, 5056, 1, 128] - - [421, 4080.81] + - [493, 4080.81] - - [2368, 448, 1, 1280] - - [432, 5423.04] + - [504, 5423.04] - - [128, 3584, 1, 256] - - [438, 4705.25] + - [510, 4705.25] - - [704, 448, 1, 1280] - - [435, 3961.07] + - [507, 3961.07] - - [8192, 800, 1, 2048] - - [433, 6306.36] + - [505, 6306.36] - - [448, 5056, 1, 128] - - [425, 3709.56] + - [497, 3709.56] - - [256, 4, 1, 1280] - - [457, 163.94] + - [529, 163.94] - - [5056, 3584, 1, 256] - - [430, 7008.34] + - [502, 7008.34] - - [2368, 4, 1, 3328] - - [390, 496.366] + - [462, 496.366] - - [1408, 5056, 1, 128] - - [425, 4175.37] + - [497, 4175.37] - - [2944, 3584, 1, 128] - - [421, 4659.79] + - [493, 4659.79] - - [3584, 2368, 1, 256] - - [443, 5851.87] + - [515, 5851.87] - - [128, 3584, 1, 3328] - - [433, 6105.04] + - [505, 6105.04] - - [128, 1024, 1, 1280] - - [373, 3848.09] + - [445, 3848.09] - - [8448, 24000, 1, 2816] - - [443, 5128.64] + - [515, 5128.64] - - [64, 704, 1, 256] - - [376, 1253.83] + - [448, 1253.83] - - [4288, 256, 1, 1280] - - [432, 5625.86] + - [504, 5625.86] - - [3584, 3584, 1, 3328] - - [437, 8206.15] + - [509, 8206.15] - - [4, 704, 1, 128] - - [452, 29.5484] + - [524, 29.5484] - - [5888, 6784, 1, 256] - - [439, 8248.75] + - [511, 8248.75] - - [4288, 2944, 1, 3328] - - [437, 8657.12] + - [509, 8657.12] - - [2944, 64, 1, 128] - - [356, 1240.7] + - [428, 1240.7] - - [1024, 128, 1, 3328] - - [381, 4433.1] + - [453, 4433.1] - - [1024, 16, 1, 500000] - - [344, 2571.15] + - [416, 2571.15] - - [4288, 128, 1, 3328] - - [381, 5716.85] + - [453, 5716.85] - - [7680, 128, 1, 2560] - - [379, 5488.1] + - [451, 5488.1] - - [256, 5056, 1, 1280] - - [439, 6380.06] + - [511, 6380.06] - - [1408, 256, 1, 128] - - [425, 1633.83] + - [497, 1633.83] - - [2944, 5888, 1, 3328] - - [434, 7849.02] + - [506, 7849.02] - - [6784, 5888, 1, 1280] - - [443, 9047.72] + - [515, 9047.72] - - [2048, 800, 1, 512] - - [438, 4841.17] + - [510, 4841.17] - - [704, 128, 1, 256] - - [383, 1567.27] + - [455, 1567.27] - - [5888, 4288, 1, 1280] - - [437, 7982.93] + - [509, 7982.93] - - [1024, 24000, 1, 2048] - - [439, 5774.4] + - [511, 5774.4] - - [448, 256, 1, 1280] - - [373, 3707.19] + - [445, 3707.19] - - [5888, 3584, 1, 128] - - [426, 3804.5] + - [498, 3804.5] - - [1024, 2944, 1, 128] - - [421, 3308.36] + - [493, 3308.36] - - [5056, 4, 1, 1280] - - [454, 469.062] + - [526, 469.062] - - [256, 1408, 1, 1280] - - [432, 4899.99] + - [504, 4899.99] - - [3072, 16, 1, 1024] - - [390, 1233.72] + - [462, 1233.72] - - [704, 3584, 1, 128] - - [421, 3919.53] + - [493, 3919.53] - - [5888, 448, 1, 3328] - - [451, 6095.71] + - [523, 6095.71] - - [2368, 4288, 1, 1280] - - [433, 8338.4] + - [505, 8338.4] - - [4288, 2944, 1, 128] - - [425, 3946.6] + - [497, 3946.6] - - [1024, 6784, 1, 3328] - - [439, 7494.38] + - [511, 7494.38] - - [128, 2368, 1, 256] - - [438, 2895.42] + - [510, 2895.42] - - [6784, 64, 1, 3328] - - [432, 5964.99] + - [504, 5964.99] - - [5056, 2944, 1, 3328] - - [443, 6605.63] + - [515, 6605.63] - - [448, 128, 1, 256] - - [376, 1339.52] + - [448, 1339.52] - - [2944, 3584, 1, 256] - - [439, 7165.66] + - [511, 7165.66] - - [1408, 1408, 1, 3328] - - [443, 8332.96] + - [515, 8332.96] - - [1856, 128, 1, 1280] - - [438, 4498.43] + - [510, 4498.43] - - [3584, 3584, 1, 128] - - [422, 4000.11] + - [494, 4000.11] - - [64, 3584, 1, 256] - - [449, 2383.23] + - [521, 2383.23] - - [1408, 4, 1, 3328] - - [400, 423.008] + - [472, 423.008] - - [128, 2944, 1, 3328] - - [405, 5430.03] + - [477, 5430.03] - - [3584, 704, 1, 256] - - [438, 6154.09] + - [510, 6154.09] - - [2944, 448, 1, 3328] - - [438, 6507.82] + - [510, 6507.82] - - [3584, 1408, 1, 3328] - - [443, 8829.73] + - [515, 8829.73] - - [704, 3584, 1, 1280] - - [433, 7860.33] + - [505, 7860.33] - - [2944, 6784, 1, 1280] - - [443, 8894.6] + - [515, 8894.6] - - [1856, 6784, 1, 256] - - [443, 8115.19] + - [515, 8115.19] - - [4288, 448, 1, 3328] - - [435, 6397.35] + - [507, 6397.35] - - [6784, 4288, 1, 128] - - [421, 4109.54] + - [493, 4109.54] - - [6784, 704, 1, 1280] - - [431, 7999.14] + - [503, 7999.14] - - [256, 4288, 1, 256] - - [435, 4603.94] + - [507, 4603.94] - - [3584, 6784, 1, 256] - - [443, 7361.65] + - [515, 7361.65] - - [6144, 12000, 1, 2048] - - [442, 6311.76] + - [514, 6311.76] - - [6144, 16, 1, 2560] - - [391, 2240.65] + - [463, 2240.65] - - [3584, 64, 1, 128] - - [362, 1292.36] + - [434, 1292.36] - - [5888, 1024, 1, 3328] - - [430, 8394.59] + - [502, 8394.59] - - [448, 64, 1, 128] - - [353, 262.244] + - [425, 262.244] - - [704, 6784, 1, 1280] - - [437, 7740.66] + - [509, 7740.66] - - [4, 1024, 1, 1280] - - [390, 378.921] + - [462, 378.921] - - [5888, 128, 1, 256] - - [438, 5003.68] + - [510, 5003.68] - - [4096, 16, 1, 4096] - - [390, 1585.85] + - [462, 1585.85] - - [1856, 5056, 1, 3328] - - [431, 8522.92] + - [503, 8522.92] - - [4, 6784, 1, 256] - - [375, 387.757] + - [447, 387.757] - - [1024, 3584, 1, 128] - - [425, 3031.61] + - [497, 3031.61] - - [1024, 1408, 1, 128] - - [427, 2600.85] + - [499, 2600.85] - - [2368, 2944, 1, 128] - - [424, 4340.26] + - [496, 4340.26] - - [5056, 64, 1, 256] - - [438, 3109.62] + - [510, 3109.62] - - [4, 448, 1, 1280] - - [458, 253.835] + - [530, 253.835] - - [5056, 2944, 1, 128] - - [429, 3740.01] + - [501, 3740.01] - - [5888, 5056, 1, 3328] - - [443, 9016.48] + - [515, 9016.48] - - [1024, 704, 1, 128] - - [425, 2363.66] + - [497, 2363.66] - - [5888, 2368, 1, 128] - - [428, 3651.83] + - [500, 3651.83] - - [128, 5056, 1, 3328] - - [432, 6243.64] + - [504, 6243.64] - - [3584, 6784, 1, 1280] - - [430, 9080.67] + - [502, 9080.67] - - [448, 4, 1, 1280] - - [458, 243.083] + - [530, 243.083] - - [1856, 5888, 1, 256] - - [443, 8182.12] + - [515, 8182.12] - - [256, 256, 1, 256] - - [376, 1542.12] + - [448, 1542.12] - - [256, 64, 1, 128] - - [357, 135.226] + - [429, 135.226] - - [4288, 4288, 1, 3328] - - [443, 8674.64] + - [515, 8674.64] - - [4288, 1408, 1, 1280] - - [431, 7867.18] + - [503, 7867.18] - - [3584, 5056, 1, 128] - - [421, 4457.83] + - [493, 4457.83] - - [4, 1024, 1, 3328] - - [370, 440.394] + - [442, 440.394] - - [4288, 2368, 1, 256] - - [451, 5699.57] + - [523, 5699.57] - - [2944, 5056, 1, 1280] - - [443, 8236.56] + - [515, 8236.56] - - [448, 6784, 1, 256] - - [433, 6620.62] + - [505, 6620.62] - - [64, 128, 1, 128] - - [358, 67.6629] + - [430, 67.6629] - - [1856, 2368, 1, 128] - - [425, 4233.7] + - [497, 4233.7] - - [6784, 2368, 1, 3328] - - [443, 8269.9] + - [515, 8269.9] - - [256, 1024, 1, 1280] - - [432, 4882.88] + - [504, 4882.88] - - [704, 4, 1, 128] - - [452, 19.111] + - [524, 19.111] - - [256, 4, 1, 256] - - [390, 46.9114] + - [462, 46.9114] - - [4288, 128, 1, 256] - - [438, 4273.49] + - [510, 4273.49] - - [4288, 1856, 1, 3328] - - [433, 8195.81] + - [505, 8195.81] - - [3584, 448, 1, 128] - - [426, 2750.65] + - [498, 2750.65] - - [2048, 1600, 1, 2048] - - [449, 5753.59] + - [521, 5753.59] - - [256, 4, 1, 3328] - - [459, 297.978] + - [531, 297.978] - - [4, 1408, 1, 1280] - - [457, 402.386] + - [529, 402.386] - - [3584, 64, 1, 1280] - - [446, 4096.1] + - [518, 4096.1] - - [1408, 448, 1, 128] - - [421, 2498.25] + - [493, 2498.25] - - [3584, 1024, 1, 1280] - - [443, 7252.18] + - [515, 7252.18] - - [1856, 5056, 1, 256] - - [437, 7711.59] + - [509, 7711.59] - - [4, 3584, 1, 256] - - [454, 314.314] + - [526, 314.314] - - [4, 2944, 1, 1280] - - [390, 483.218] + - [462, 483.218] - - [1024, 4288, 1, 256] - - [442, 6544.52] + - [514, 6544.52] - - [5888, 3584, 1, 3328] - - [431, 8105.15] + - [503, 8105.15] - - [1856, 4, 1, 256] - - [390, 252.832] + - [462, 252.832] - - [4, 256, 1, 256] - - [375, 48.2882] + - [447, 48.2882] - - [5056, 3584, 1, 3328] - - [436, 7354.8] + - [508, 7354.8] - - [704, 448, 1, 128] - - [429, 1233.91] + - [501, 1233.91] - - [2368, 1408, 1, 1280] - - [437, 6654.24] + - [509, 6654.24] - - [5056, 2944, 1, 1280] - - [443, 8505.72] + - [515, 8505.72] - - [4, 4, 1, 128] - - [453, 0.1478505] + - [525, 0.1478505] - - [3584, 256, 1, 256] - - [435, 4616.47] + - [507, 4616.47] - - [1024, 6784, 1, 256] - - [437, 7944.98] + - [509, 7944.98] - - [4, 128, 1, 256] - - [390, 29.3571] + - [462, 29.3571] - - [64, 64, 1, 1280] - - [401, 642.61] + - [473, 642.61] - - [5124, 9124, 1, 2048] - - [443, 8019.4] + - [515, 8019.4] - - [6784, 4, 1, 128] - - [452, 193.067] + - [524, 193.067] - - [2944, 1408, 1, 128] - - [421, 3827.13] + - [493, 3827.13] - - [448, 128, 1, 3328] - - [394, 4064.0] + - [466, 4064.0] - - [3584, 1408, 1, 1280] - - [443, 7180.83] + - [515, 7180.83] - - [64, 4288, 1, 3328] - - [389, 4786.84] + - [461, 4786.84] - - [5056, 6784, 1, 3328] - - [430, 7889.83] + - [502, 7889.83] - - [128, 2944, 1, 256] - - [433, 3599.69] + - [505, 3599.69] - - [128, 6784, 1, 128] - - [351, 2606.79] + - [423, 2606.79] - - [3584, 4288, 1, 256] - - [437, 7299.81] + - [509, 7299.81] - - [448, 1856, 1, 256] - - [433, 5207.07] + - [505, 5207.07] - - [1856, 6784, 1, 3328] - - [435, 8386.36] + - [507, 8386.36] - - [3584, 128, 1, 3328] - - [379, 5590.04] + - [451, 5590.04] - - [64, 1856, 1, 256] - - [372, 1949.38] + - [444, 1949.38] - - [64, 448, 1, 256] - - [377, 955.833] + - [449, 955.833] - - [5888, 4288, 1, 256] - - [441, 7791.84] + - [513, 7791.84] - - [4, 448, 1, 128] - - [452, 8.84146] + - [524, 8.84146] - - [5056, 1408, 1, 256] - - [443, 5154.01] + - [515, 5154.01] - - [35, 8457, 1, 2048] - - [348, 3182.57] + - [420, 3182.57] - - [64, 256, 1, 1280] - - [397, 1713.46] + - [469, 1713.46] - - [3584, 1024, 1, 256] - - [433, 6528.18] + - [505, 6528.18] - - [256, 704, 1, 256] - - [432, 2720.46] + - [504, 2720.46] - - [5888, 5888, 1, 256] - - [441, 7992.26] + - [513, 7992.26] - - [4288, 1024, 1, 1280] - - [435, 7837.5] + - [507, 7837.5] - - [5888, 128, 1, 3328] - - [438, 7181.13] + - [510, 7181.13] - - [448, 6784, 1, 3328] - - [432, 7663.1] + - [504, 7663.1] - - [2944, 1408, 1, 1280] - - [441, 7903.14] + - [513, 7903.14] - - [64, 128, 1, 1280] - - [390, 1191.66] + - [462, 1191.66] - - [2944, 1856, 1, 3328] - - [431, 7844.41] + - [503, 7844.41] - - [2368, 64, 1, 128] - - [362, 997.973] + - [434, 997.973] - - [256, 1024, 1, 128] - - [421, 1215.84] + - [493, 1215.84] - - [3584, 5888, 1, 1280] - - [430, 8958.94] + - [502, 8958.94] - - [64, 4, 1, 128] - - [453, 1.21608] + - [525, 1.21608] - - [6784, 1856, 1, 1280] - - [430, 6728.8] + - [502, 6728.8] - - [2944, 5056, 1, 256] - - [443, 8275.21] + - [515, 8275.21] - - [4288, 4, 1, 128] - - [452, 147.644] + - [524, 147.644] - - [5888, 256, 1, 3328] - - [439, 7094.2] + - [511, 7094.2] - - [2944, 4288, 1, 128] - - [424, 4611.55] + - [496, 4611.55] - - [3584, 1408, 1, 256] - - [434, 6543.06] + - [506, 6543.06] - - [704, 3584, 1, 3328] - - [433, 8117.2] + - [505, 8117.2] - - [4096, 3200, 1, 1024] - - [448, 6656.13] + - [520, 6656.13] - - [5056, 448, 1, 1280] - - [446, 6096.2] + - [518, 6096.2] - - [3584, 1856, 1, 3328] - - [431, 8552.41] + - [503, 8552.41] - - [4288, 6784, 1, 1280] - - [437, 8212.46] + - [509, 8212.46] - - [2560, 7000, 1, 2560] - - [439, 7655.34] + - [511, 7655.34] - - [1408, 704, 1, 1280] - - [435, 5756.79] + - [507, 5756.79] - - [2944, 1024, 1, 256] - - [443, 6880.91] + - [515, 6880.91] - - [6784, 64, 1, 256] - - [438, 4438.96] + - [510, 4438.96] - - [2368, 4288, 1, 3328] - - [439, 8377.99] + - [511, 8377.99] - - [4, 1408, 1, 256] - - [456, 222.599] + - [528, 222.599] - - [1024, 1408, 1, 1280] - - [433, 6339.38] + - [505, 6339.38] - - [64, 64, 1, 256] - - [390, 187.346] + - [462, 187.346] - - [704, 256, 1, 3328] - - [432, 4046.14] + - [504, 4046.14] - - [6784, 5056, 1, 256] - - [443, 7972.17] + - [515, 7972.17] - - [1856, 1856, 1, 128] - - [427, 3716.61] + - [499, 3716.61] - - [3584, 5056, 1, 3328] - - [443, 8684.76] + - [515, 8684.76] - - [448, 6784, 1, 128] - - [425, 3829.05] + - [497, 3829.05] - - [4, 704, 1, 3328] - - [458, 393.206] + - [530, 393.206] - - [35, 8457, 1, 4096] - - [347, 3173.24] + - [419, 3173.24] - - [448, 2944, 1, 256] - - [441, 5553.41] + - [513, 5553.41] - - [4, 4288, 1, 3328] - - [400, 573.211] + - [472, 573.211] - - [2944, 6784, 1, 256] - - [437, 8566.06] + - [509, 8566.06] - - [2944, 2944, 1, 128] - - [421, 4540.83] + - [493, 4540.83] - - [4, 4, 1, 1280] - - [400, 3.14762] + - [472, 3.14762] - - [1856, 3584, 1, 1280] - - [437, 7306.36] + - [509, 7306.36] - - [64, 2944, 1, 256] - - [449, 2292.61] + - [521, 2292.61] - - [448, 256, 1, 128] - - [358, 797.93] + - [430, 797.93] - - [4288, 448, 1, 128] - - [424, 3430.5] + - [496, 3430.5] - - [4608, 24000, 1, 1536] - - [442, 6820.24] + - [514, 6820.24] - - [1856, 1408, 1, 3328] - - [445, 6600.24] + - [517, 6600.24] - - [128, 128, 1, 128] - - [350, 161.917] + - [422, 161.917] - - [1024, 4288, 1, 3328] - - [433, 7937.08] + - [505, 7937.08] - - [448, 2368, 1, 256] - - [441, 4526.45] + - [513, 4526.45] - - [1024, 4, 1, 128] - - [453, 16.9907] + - [525, 16.9907] - - [64, 1408, 1, 1280] - - [373, 3345.32] + - [445, 3345.32] - - [64, 6784, 1, 1280] - - [438, 5526.6] + - [510, 5526.6] - - [5056, 448, 1, 256] - - [432, 4216.65] + - [504, 4216.65] - - [2944, 2368, 1, 3328] - - [443, 7000.42] + - [515, 7000.42] - - [704, 4288, 1, 3328] - - [449, 6414.43] + - [521, 6414.43] - - [1408, 128, 1, 256] - - [432, 2720.46] + - [504, 2720.46] - - [1024, 1856, 1, 1280] - - [443, 7682.93] + - [515, 7682.93] - - [2048, 6400, 1, 2048] - - [439, 7418.22] + - [511, 7418.22] - - [512, 48000, 1, 2816] - - [443, 8884.77] + - [515, 8884.77] - - [5124, 9124, 1, 2560] - - [435, 6040.8] + - [507, 6040.8] - - [128, 2368, 1, 3328] - - [389, 5025.66] + - [461, 5025.66] - - [1024, 5888, 1, 256] - - [437, 7322.21] + - [509, 7322.21] - - [64, 2944, 1, 1280] - - [373, 4222.31] + - [445, 4222.31] - - [5056, 64, 1, 3328] - - [414, 4936.32] + - [486, 4936.32] - - [128, 704, 1, 128] - - [359, 683.414] + - [431, 683.414] - - [1408, 2368, 1, 256] - - [438, 6404.22] + - [510, 6404.22] - - [1408, 1408, 1, 256] - - [443, 4537.93] + - [515, 4537.93] - - [4, 64, 1, 128] - - [452, 2.56747] + - [524, 2.56747] - - [64, 1024, 1, 128] - - [351, 532.372] + - [423, 532.372] - - [1024, 8, 1, 500000] - - [341, 1685.08] + - [413, 1685.08] - - [2368, 2368, 1, 128] - - [422, 4334.33] + - [494, 4334.33] - - [64, 5888, 1, 128] - - [351, 2003.19] + - [423, 2003.19] - - [5888, 4, 1, 3328] - - [369, 339.118] + - [441, 339.118] - - [6784, 1408, 1, 128] - - [425, 4431.23] + - [497, 4431.23] - - [4288, 5888, 1, 256] - - [443, 7800.88] + - [515, 7800.88] - - [1408, 5056, 1, 256] - - [437, 8153.38] + - [509, 8153.38] - - [5056, 128, 1, 3328] - - [394, 5829.93] + - [466, 5829.93] - - [128, 128, 1, 1280] - - [397, 1691.35] + - [469, 1691.35] - - [448, 704, 1, 256] - - [438, 3364.28] + - [510, 3364.28] - - [4288, 3584, 1, 128] - - [422, 2952.68] + - [494, 2952.68] - - [2944, 128, 1, 3328] - - [394, 5620.82] + - [466, 5620.82] - - [64, 1408, 1, 3328] - - [395, 4169.91] + - [467, 4169.91] - - [3584, 5056, 1, 1280] - - [440, 7780.76] + - [512, 7780.76] - - [256, 448, 1, 1280] - - [373, 3929.45] + - [445, 3929.45] - - [704, 704, 1, 128] - - [421, 2346.17] + - [493, 2346.17] - - [5056, 4, 1, 128] - - [452, 144.557] + - [524, 144.557] - - [704, 256, 1, 1280] - - [441, 2283.22] + - [513, 2283.22] - - [64, 2368, 1, 3328] - - [373, 4921.69] + - [445, 4921.69] - - [1856, 1024, 1, 128] - - [422, 3459.57] + - [494, 3459.57] - - [1856, 64, 1, 128] - - [354, 918.237] + - [426, 918.237] - - [4096, 64, 1, 4096] - - [399, 4000.62] + - [471, 4000.62] - - [1024, 24000, 1, 1536] - - [435, 8502.36] + - [507, 8502.36] - - [704, 4288, 1, 256] - - [439, 6003.83] + - [511, 6003.83] - - [5888, 2368, 1, 1280] - - [430, 8801.3] + - [502, 8801.3] - - [128, 256, 1, 256] - - [384, 1070.08] + - [456, 1070.08] - - [64, 128, 1, 256] - - [390, 374.591] + - [462, 374.591] - - [2368, 5888, 1, 1280] - - [433, 8308.63] + - [505, 8308.63] - - [5888, 256, 1, 1280] - - [441, 7154.42] + - [513, 7154.42] - - [1760, 128, 1, 1760] - - [382, 5363.91] + - [454, 5363.91] - - [4, 5888, 1, 1280] - - [390, 542.304] + - [462, 542.304] - - [704, 128, 1, 128] - - [362, 779.447] + - [434, 779.447] - - [1024, 4, 1, 1280] - - [390, 392.531] + - [462, 392.531] - - [2368, 1856, 1, 3328] - - [433, 7975.32] + - [505, 7975.32] - - [2368, 128, 1, 128] - - [355, 1584.96] + - [427, 1584.96] - - [2944, 704, 1, 256] - - [441, 4039.21] + - [513, 4039.21] - - [5056, 128, 1, 128] - - [421, 2575.89] + - [493, 2575.89] - - [2368, 1024, 1, 3328] - - [449, 6165.54] + - [521, 6165.54] - - [256, 704, 1, 3328] - - [432, 4028.74] + - [504, 4028.74] - - [704, 3584, 1, 256] - - [443, 6102.92] + - [515, 6102.92] - - [704, 2944, 1, 3328] - - [433, 8202.84] + - [505, 8202.84] - - [6784, 1024, 1, 128] - - [425, 4386.4] + - [497, 4386.4] - - [256, 448, 1, 128] - - [362, 834.195] + - [434, 834.195] - - [448, 1024, 1, 3328] - - [450, 5412.48] + - [522, 5412.48] - - [2944, 1024, 1, 3328] - - [443, 6265.87] + - [515, 6265.87] - - [2944, 5056, 1, 128] - - [421, 4770.88] + - [493, 4770.88] - - [2368, 256, 1, 256] - - [438, 3975.23] + - [510, 3975.23] - - [1408, 6784, 1, 256] - - [437, 7987.02] + - [509, 7987.02] - - [6784, 1408, 1, 3328] - - [437, 8472.71] + - [509, 8472.71] - - [4288, 6784, 1, 128] - - [428, 3865.2] + - [500, 3865.2] - - [704, 64, 1, 256] - - [376, 1287.41] + - [448, 1287.41] - - [5888, 4, 1, 1280] - - [375, 510.022] + - [447, 510.022] - - [256, 2368, 1, 3328] - - [438, 5837.65] + - [510, 5837.65] - - [6784, 2944, 1, 1280] - - [443, 8560.54] + - [515, 8560.54] - - [4288, 1856, 1, 128] - - [421, 4617.07] + - [493, 4617.07] - - [1856, 2944, 1, 128] - - [421, 4287.73] + - [493, 4287.73] - - [6784, 448, 1, 128] - - [425, 3893.43] + - [497, 3893.43] - - [64, 3584, 1, 128] - - [351, 1609.76] + - [423, 1609.76] - - [448, 5056, 1, 1280] - - [441, 7124.41] + - [513, 7124.41] - - [2368, 1856, 1, 128] - - [424, 4004.65] + - [496, 4004.65] - - [64, 2944, 1, 3328] - - [374, 5086.48] + - [446, 5086.48] - - [4288, 704, 1, 256] - - [439, 6176.57] + - [511, 6176.57] - - [256, 3584, 1, 128] - - [422, 2553.15] + - [494, 2553.15] - - [5888, 704, 1, 256] - - [438, 6781.51] + - [510, 6781.51] - - [3584, 1024, 1, 128] - - [425, 3660.95] + - [497, 3660.95] - - [256, 5888, 1, 3328] - - [441, 7772.13] + - [513, 7772.13] - - [1408, 4288, 1, 3328] - - [437, 8832.86] + - [509, 8832.86] - - [6784, 4288, 1, 256] - - [443, 8566.14] + - [515, 8566.14] - - [4288, 256, 1, 128] - - [423, 1953.79] + - [495, 1953.79] - - [5888, 256, 1, 256] - - [441, 3730.53] + - [513, 3730.53] - - [6784, 1024, 1, 1280] - - [437, 8578.39] + - [509, 8578.39] - - [5888, 1024, 1, 128] - - [422, 4092.96] + - [494, 4092.96] - - [1024, 128, 1, 256] - - [372, 1897.98] + - [444, 1897.98] - - [512, 16, 1, 500000] - - [343, 2363.79] + - [415, 2363.79] - - [128, 64, 1, 3328] - - [400, 1592.56] + - [472, 1592.56] - - [448, 64, 1, 256] - - [390, 976.168] + - [462, 976.168] - - [2368, 256, 1, 128] - - [425, 2094.99] + - [497, 2094.99] - - [6784, 3584, 1, 1280] - - [437, 8570.16] + - [509, 8570.16] - - [1024, 6784, 1, 1280] - - [443, 8203.57] + - [515, 8203.57] - - [2944, 64, 1, 1280] - - [381, 4300.61] + - [453, 4300.61] - - [1408, 2944, 1, 1280] - - [433, 7349.64] + - [505, 7349.64] - - [256, 1856, 1, 256] - - [432, 4649.75] + - [504, 4649.75] - - [2048, 800, 1, 2048] - - [451, 4668.73] + - [523, 4668.73] - - [1408, 2368, 1, 3328] - - [441, 7537.74] + - [513, 7537.74] - - [2944, 4, 1, 3328] - - [390, 514.142] + - [462, 514.142] - - [128, 1408, 1, 3328] - - [382, 4991.64] + - [454, 4991.64] - - [2944, 1856, 1, 128] - - [421, 4317.39] + - [493, 4317.39] - - [256, 2944, 1, 128] - - [421, 2258.27] + - [493, 2258.27] - - [256, 6784, 1, 128] - - [421, 3147.02] + - [493, 3147.02] - - [2368, 4, 1, 128] - - [453, 33.9286] + - [525, 33.9286] - - [1408, 256, 1, 3328] - - [432, 5077.85] + - [504, 5077.85] - - [1856, 4, 1, 128] - - [453, 21.5025] + - [525, 21.5025] - - [5056, 6784, 1, 128] - - [421, 4945.11] + - [493, 4945.11] - - [4288, 5056, 1, 128] - - [424, 4729.87] + - [496, 4729.87] - - [1856, 5888, 1, 128] - - [421, 4707.96] + - [493, 4707.96] - - [2944, 5888, 1, 256] - - [435, 8014.78] + - [507, 8014.78] - - [3584, 1856, 1, 256] - - [437, 7567.13] + - [509, 7567.13] - - [4288, 3584, 1, 1280] - - [430, 8726.43] + - [502, 8726.43] - - [2368, 448, 1, 256] - - [438, 4227.7] + - [510, 4227.7] - - [4288, 256, 1, 3328] - - [439, 5487.41] + - [511, 5487.41] - - [1856, 704, 1, 128] - - [425, 3125.06] + - [497, 3125.06] - - [1408, 64, 1, 256] - - [385, 1620.09] + - [457, 1620.09] - - [64, 1856, 1, 128] - - [349, 955.147] + - [421, 955.147] - - [4, 256, 1, 128] - - [452, 10.8789] + - [524, 10.8789] - - [2560, 16, 1, 2560] - - [397, 2019.7] + - [469, 2019.7] - - [704, 5888, 1, 128] - - [426, 3976.26] + - [498, 3976.26] - - [6784, 3584, 1, 128] - - [425, 4018.91] + - [497, 4018.91] - - [1024, 64, 1, 256] - - [390, 1370.79] + - [462, 1370.79] - - [64, 2368, 1, 256] - - [432, 2255.76] + - [504, 2255.76] - - [4288, 5056, 1, 3328] - - [437, 8368.69] + - [509, 8368.69] - - [4, 1856, 1, 1280] - - [390, 392.126] + - [462, 392.126] - - [4288, 128, 1, 128] - - [355, 2287.03] + - [427, 2287.03] - - [1408, 1408, 1, 128] - - [425, 3233.48] + - [497, 3233.48] - - [7680, 16, 1, 2560] - - [393, 2257.37] + - [465, 2257.37] - - [1856, 128, 1, 128] - - [355, 1532.8] + - [427, 1532.8] - - [5056, 2368, 1, 256] - - [437, 8167.29] + - [509, 8167.29] - - [4288, 704, 1, 3328] - - [443, 6411.16] + - [515, 6411.16] - - [448, 3584, 1, 256] - - [443, 5477.74] + - [515, 5477.74] - - [2368, 64, 1, 1280] - - [373, 3936.52] + - [445, 3936.52] - - [2368, 1024, 1, 1280] - - [439, 7688.82] + - [511, 7688.82] - - [2944, 1408, 1, 3328] - - [430, 7668.78] + - [502, 7668.78] - - [1408, 448, 1, 256] - - [432, 4863.98] + - [504, 4863.98] - - [1024, 1408, 1, 3328] - - [441, 7448.99] + - [513, 7448.99] - - [2944, 5888, 1, 1280] - - [431, 8208.57] + - [503, 8208.57] - - [1408, 4, 1, 1280] - - [370, 479.419] + - [442, 479.419] - - [5888, 3584, 1, 256] - - [431, 8610.09] + - [503, 8610.09] - - [2368, 5056, 1, 128] - - [428, 3726.25] + - [500, 3726.25] - - [1408, 1856, 1, 3328] - - [432, 7829.48] + - [504, 7829.48] - - [4, 4, 1, 3328] - - [459, 4.39419] + - [531, 4.39419] - - [6784, 1408, 1, 1280] - - [432, 7690.8] + - [504, 7690.8] - - [4096, 7000, 1, 4096] - - [444, 6272.49] + - [516, 6272.49] - - [704, 2944, 1, 256] - - [433, 6095.91] + - [505, 6095.91] - - [4288, 64, 1, 256] - - [398, 2121.31] + - [470, 2121.31] - - [6784, 5888, 1, 3328] - - [437, 8955.6] + - [509, 8955.6] - - [2368, 4288, 1, 128] - - [421, 4699.65] + - [493, 4699.65] - - [64, 4288, 1, 1280] - - [411, 4013.73] + - [483, 4013.73] - - [6784, 64, 1, 1280] - - [432, 5418.83] + - [504, 5418.83] - - [3584, 128, 1, 128] - - [361, 2165.3] + - [433, 2165.3] - - [1024, 6784, 1, 128] - - [422, 3765.3] + - [494, 3765.3] - - [4, 1856, 1, 128] - - [453, 33.3728] + - [525, 33.3728] - - [1408, 64, 1, 3328] - - [394, 4489.51] + - [466, 4489.51] - - [6784, 4, 1, 256] - - [390, 400.262] + - [462, 400.262] - - [1408, 1408, 1, 1280] - - [437, 8139.53] + - [509, 8139.53] - - [16384, 400, 1, 4096] - - [441, 6087.28] + - [513, 6087.28] - - [256, 2368, 1, 256] - - [432, 4766.35] + - [504, 4766.35] - - [448, 4288, 1, 3328] - - [439, 7577.08] + - [511, 7577.08] - - [2368, 1408, 1, 256] - - [435, 5284.53] + - [507, 5284.53] - - [5888, 5056, 1, 128] - - [422, 3643.6] + - [494, 3643.6] - - [704, 2368, 1, 256] - - [437, 5334.73] + - [509, 5334.73] - - [1024, 24000, 1, 2560] - - [445, 7438.06] + - [517, 7438.06] - - [2944, 448, 1, 1280] - - [446, 4937.53] + - [518, 4937.53] - - [5888, 2368, 1, 3328] - - [431, 8201.84] + - [503, 8201.84] - - [5124, 9124, 1, 1760] - - [438, 6764.06] + - [510, 6764.06] - - [448, 1408, 1, 1280] - - [432, 5881.54] + - [504, 5881.54] - - [448, 1856, 1, 1280] - - [439, 6225.56] + - [511, 6225.56] - - [4288, 448, 1, 1280] - - [441, 5626.37] + - [513, 5626.37] - - [5888, 704, 1, 3328] - - [435, 7873.62] + - [507, 7873.62] - - [5056, 256, 1, 128] - - [426, 2921.03] + - [498, 2921.03] - - [1856, 256, 1, 128] - - [428, 1995.42] + - [500, 1995.42] - - [64, 1408, 1, 128] - - [349, 758.938] + - [421, 758.938] - - [704, 4, 1, 256] - - [390, 130.697] + - [462, 130.697] - - [1408, 5888, 1, 128] - - [421, 4574.05] + - [493, 4574.05] - - [7680, 12000, 1, 2560] - - [437, 8747.13] + - [509, 8747.13] - - [1408, 1024, 1, 256] - - [434, 4609.23] + - [506, 4609.23] - - [8192, 400, 1, 2048] - - [446, 5283.25] + - [518, 5283.25] - - [1024, 1856, 1, 128] - - [421, 2686.38] + - [493, 2686.38] - - [256, 704, 1, 128] - - [421, 1004.83] + - [493, 1004.83] - - [2560, 128, 1, 2560] - - [399, 4259.14] + - [471, 4259.14] - - [448, 1024, 1, 256] - - [432, 4813.24] + - [504, 4813.24] - - [128, 4, 1, 3328] - - [458, 128.408] + - [530, 128.408] - - [5056, 6784, 1, 1280] - - [440, 6579.85] + - [512, 6579.85] - - [1408, 64, 1, 128] - - [362, 819.3] + - [434, 819.3] - - [1024, 448, 1, 1280] - - [441, 5703.31] + - [513, 5703.31] - - [704, 5056, 1, 3328] - - [433, 7574.49] + - [505, 7574.49] - - [128, 5056, 1, 256] - - [432, 5113.53] + - [504, 5113.53] - - [64, 1024, 1, 3328] - - [417, 3980.1] + - [489, 3980.1] - - [1856, 4, 1, 3328] - - [371, 433.253] + - [443, 433.253] - - [4, 2944, 1, 128] - - [453, 46.6225] + - [525, 46.6225] - - [2368, 2944, 1, 3328] - - [431, 9002.13] + - [503, 9002.13] - - [448, 448, 1, 1280] - - [373, 3969.52] + - [445, 3969.52] - - [2368, 3584, 1, 256] - - [443, 7806.39] + - [515, 7806.39] - - [5056, 3584, 1, 1280] - - [430, 8971.56] + - [502, 8971.56] - - [5124, 9124, 1, 4096] - - [443, 7208.72] + - [515, 7208.72] - - [7680, 48000, 1, 2560] - - [437, 3835.91] + - [509, 3835.91] - - [448, 4, 1, 3328] - - [458, 409.7] + - [530, 409.7] - - [1856, 2944, 1, 1280] - - [430, 7173.71] + - [502, 7173.71] - - [1024, 48000, 1, 2816] - - [437, 8976.26] + - [509, 8976.26] - - [128, 1024, 1, 256] - - [376, 1969.26] + - [448, 1969.26] - - [2944, 1408, 1, 256] - - [439, 4585.12] + - [511, 4585.12] - - [4288, 1408, 1, 3328] - - [433, 8237.27] + - [505, 8237.27] - - [3584, 64, 1, 3328] - - [379, 5183.16] + - [451, 5183.16] - - [5888, 2944, 1, 128] - - [428, 3674.56] + - [500, 3674.56] - - [2944, 1024, 1, 128] - - [425, 3834.32] + - [497, 3834.32] - - [4288, 5056, 1, 1280] - - [437, 8086.1] + - [509, 8086.1] - - [5888, 6784, 1, 1280] - - [431, 6941.32] + - [503, 6941.32] - - [6784, 5056, 1, 128] - - [422, 4860.15] + - [494, 4860.15] - - [256, 1024, 1, 3328] - - [446, 5156.22] + - [518, 5156.22] - - [3584, 4, 1, 256] - - [390, 332.529] + - [462, 332.529] - - [1760, 1600, 1, 1760] - - [433, 6330.76] + - [505, 6330.76] - - [1856, 64, 1, 3328] - - [394, 4756.03] + - [466, 4756.03] - - [4, 128, 1, 3328] - - [458, 160.244] + - [530, 160.244] - - [5888, 1408, 1, 3328] - - [431, 8722.74] + - [503, 8722.74] - - [448, 2944, 1, 128] - - [424, 2997.63] + - [496, 2997.63] - - [2368, 1856, 1, 256] - - [432, 6662.34] + - [504, 6662.34] - - [256, 5056, 1, 256] - - [434, 5256.29] + - [506, 5256.29] - - [128, 3584, 1, 128] - - [353, 2073.56] + - [425, 2073.56] - - [448, 3584, 1, 3328] - - [430, 6833.96] + - [502, 6833.96] - - [4, 5056, 1, 3328] - - [400, 581.523] + - [472, 581.523] - - [704, 2368, 1, 128] - - [421, 3402.29] + - [493, 3402.29] - - [5888, 256, 1, 128] - - [426, 2977.54] + - [498, 2977.54] - - [4, 5056, 1, 128] - - [452, 65.2074] + - [524, 65.2074] - - [448, 256, 1, 256] - - [438, 1764.53] + - [510, 1764.53] - - [704, 4, 1, 3328] - - [390, 398.554] + - [462, 398.554] - - [1408, 256, 1, 256] - - [433, 3463.86] + - [505, 3463.86] - - [3584, 1856, 1, 128] - - [429, 3228.19] + - [501, 3228.19] - - [4288, 4288, 1, 128] - - [425, 4853.93] + - [497, 4853.93] - - [1856, 1024, 1, 3328] - - [449, 5994.68] + - [521, 5994.68] - - [128, 5888, 1, 3328] - - [403, 6512.85] + - [475, 6512.85] - - [1024, 5056, 1, 256] - - [443, 7859.42] + - [515, 7859.42] - - [5888, 5888, 1, 1280] - - [443, 8131.44] + - [515, 8131.44] - - [5056, 5888, 1, 128] - - [422, 4920.71] + - [494, 4920.71] - - [2368, 1408, 1, 3328] - - [441, 7110.74] + - [513, 7110.74] - - [1024, 48000, 1, 1536] - - [441, 8590.82] + - [513, 8590.82] - - [5888, 448, 1, 256] - - [442, 3567.74] + - [514, 3567.74] - - [2560, 3200, 1, 2560] - - [432, 7638.31] + - [504, 7638.31] - - [5888, 6784, 1, 128] - - [422, 3910.92] + - [494, 3910.92] - - [6144, 48000, 1, 2048] - - [443, 3412.95] + - [515, 3412.95] - - [6784, 5056, 1, 1280] - - [434, 7890.22] + - [506, 7890.22] - - [5056, 704, 1, 1280] - - [438, 7665.06] + - [510, 7665.06] - - [1024, 48000, 1, 2560] - - [443, 8188.5] + - [515, 8188.5] - - [4608, 32, 1, 1536] - - [411, 2856.97] + - [483, 2856.97] - - [1024, 2368, 1, 128] - - [421, 3019.35] + - [493, 3019.35] - - [128, 704, 1, 256] - - [372, 1696.33] + - [444, 1696.33] - - [2368, 448, 1, 3328] - - [438, 5799.29] + - [510, 5799.29] - - [128, 5888, 1, 1280] - - [432, 6680.75] + - [504, 6680.75] - - [16384, 800, 1, 4096] - - [437, 6322.22] + - [509, 6322.22] - - [448, 128, 1, 1280] - - [411, 2849.49] + - [483, 2849.49] - - [6784, 4, 1, 3328] - - [390, 563.12] + - [462, 563.12] - - [5888, 5056, 1, 1280] - - [437, 8631.33] + - [509, 8631.33] - - [1024, 64, 1, 3328] - - [412, 3481.96] + - [484, 3481.96] - - [3072, 48000, 1, 1024] - - [437, 9019.49] + - [509, 9019.49] - - [64, 3584, 1, 1280] - - [374, 4327.95] + - [446, 4327.95] - - [6784, 1408, 1, 256] - - [437, 6320.59] + - [509, 6320.59] - - [3584, 5888, 1, 128] - - [424, 4406.79] + - [496, 4406.79] - - [5056, 5888, 1, 256] - - [443, 8037.13] + - [515, 8037.13] - - [2368, 1024, 1, 256] - - [435, 4936.14] + - [507, 4936.14] - - [2944, 1856, 1, 256] - - [443, 7222.32] + - [515, 7222.32] - - [1856, 6784, 1, 1280] - - [433, 8251.81] + - [505, 8251.81] - - [64, 5056, 1, 128] - - [353, 1643.7] + - [425, 1643.7] - - [64, 6784, 1, 128] - - [351, 1929.77] + - [423, 1929.77] - - [448, 704, 1, 128] - - [423, 979.959] + - [495, 979.959] - - [4, 1024, 1, 128] - - [452, 20.1416] + - [524, 20.1416] - - [4288, 3584, 1, 256] - - [437, 8444.14] + - [509, 8444.14] - - [1408, 704, 1, 128] - - [421, 3021.0] + - [493, 3021.0] - - [64, 256, 1, 3328] - - [417, 2227.47] + - [489, 2227.47] - - [6784, 448, 1, 3328] - - [443, 6573.11] + - [515, 6573.11] - - [5056, 1856, 1, 1280] - - [435, 7976.23] + - [507, 7976.23] - - [1408, 1024, 1, 3328] - - [433, 7470.33] + - [505, 7470.33] - - [2368, 256, 1, 3328] - - [438, 5394.37] + - [510, 5394.37] - - [5888, 3584, 1, 1280] - - [430, 9031.55] + - [502, 9031.55] - - [1856, 3584, 1, 3328] - - [445, 7272.6] + - [517, 7272.6] - - [5888, 128, 1, 1280] - - [438, 6684.48] + - [510, 6684.48] - - [1024, 2944, 1, 256] - - [443, 7415.09] + - [515, 7415.09] - - [448, 6784, 1, 1280] - - [439, 7923.78] + - [511, 7923.78] - - [256, 3584, 1, 1280] - - [435, 6901.87] + - [507, 6901.87] - - [704, 5056, 1, 256] - - [440, 5004.55] + - [512, 5004.55] - - [3584, 1024, 1, 3328] - - [432, 7894.63] + - [504, 7894.63] - - [2944, 1856, 1, 1280] - - [437, 7903.27] + - [509, 7903.27] - - [128, 256, 1, 128] - - [350, 325.745] + - [422, 325.745] - - [5056, 256, 1, 256] - - [434, 3356.56] + - [506, 3356.56] - - [2944, 4288, 1, 3328] - - [443, 7813.93] + - [515, 7813.93] - - [2368, 3584, 1, 3328] - - [443, 8371.09] + - [515, 8371.09] - - [2944, 704, 1, 1280] - - [449, 5514.09] + - [521, 5514.09] - - [128, 4, 1, 256] - - [390, 25.3062] + - [462, 25.3062] - - [2944, 3584, 1, 1280] - - [437, 7738.83] + - [509, 7738.83] - - [1856, 5888, 1, 1280] - - [431, 8584.63] + - [503, 8584.63] - - [256, 256, 1, 1280] - - [411, 2962.18] + - [483, 2962.18] - - [2048, 3200, 1, 2048] - - [439, 6911.69] + - [511, 6911.69] - - [4288, 1408, 1, 256] - - [437, 7954.0] + - [509, 7954.0] - - [3584, 64, 1, 256] - - [438, 2780.42] + - [510, 2780.42] - - [64, 1856, 1, 3328] - - [373, 4912.04] + - [445, 4912.04] - - [256, 1408, 1, 128] - - [421, 1373.24] + - [493, 1373.24] - - [5888, 1408, 1, 128] - - [426, 4242.01] + - [498, 4242.01] - - [4288, 2368, 1, 1280] - - [435, 8012.7] + - [507, 8012.7] - - [4, 4288, 1, 256] - - [456, 301.674] + - [528, 301.674] - - [256, 4288, 1, 128] - - [421, 2706.36] + - [493, 2706.36] - - [2048, 128, 1, 2048] - - [416, 2885.26] + - [488, 2885.26] - - [256, 128, 1, 3328] - - [418, 3170.21] + - [490, 3170.21] - - [512, 8, 1, 500000] - - [342, 1915.12] + - [414, 1915.12] - - [6784, 2368, 1, 256] - - [437, 8323.66] + - [509, 8323.66] - - [5888, 128, 1, 128] - - [425, 2466.08] + - [497, 2466.08] - - [1024, 24000, 1, 2816] - - [435, 8131.64] + - [507, 8131.64] - - [7680, 5984, 1, 2560] - - [439, 6040.77] + - [511, 6040.77] - - [4288, 1856, 1, 256] - - [451, 5818.53] + - [523, 5818.53] - - [1856, 256, 1, 3328] - - [432, 6532.03] + - [504, 6532.03] - - [1856, 2944, 1, 256] - - [437, 7312.92] + - [509, 7312.92] - - [5056, 1024, 1, 128] - - [427, 4103.0] + - [499, 4103.0] - - [64, 5888, 1, 1280] - - [432, 5058.25] + - [504, 5058.25] - - [1760, 800, 1, 1760] - - [435, 7280.0] + - [507, 7280.0] - - [6784, 256, 1, 128] - - [425, 3257.69] + - [497, 3257.69] - - [5888, 704, 1, 128] - - [421, 3813.93] + - [493, 3813.93] - - [1408, 2368, 1, 128] - - [422, 3561.27] + - [494, 3561.27] - - [1024, 4288, 1, 1280] - - [441, 7752.74] + - [513, 7752.74] - - [2368, 5056, 1, 3328] - - [444, 7711.91] + - [516, 7711.91] - - [448, 4, 1, 128] - - [452, 18.4795] + - [524, 18.4795] - - [4, 256, 1, 3328] - - [459, 269.71] + - [531, 269.71] - - [4288, 1024, 1, 3328] - - [438, 7910.27] + - [510, 7910.27] - - [6144, 48000, 1, 2560] - - [437, 3541.09] + - [509, 3541.09] - - [1024, 5056, 1, 3328] - - [431, 8509.66] + - [503, 8509.66] - - [1024, 1856, 1, 3328] - - [437, 7907.93] + - [509, 7907.93] - - [704, 704, 1, 1280] - - [449, 5648.15] + - [521, 5648.15] - - [128, 2368, 1, 1280] - - [408, 4145.11] + - [480, 4145.11] - - [1408, 128, 1, 3328] - - [381, 4919.6] + - [453, 4919.6] - - [3584, 256, 1, 1280] - - [433, 5185.56] + - [505, 5185.56] - - [4, 128, 1, 128] - - [452, 3.07891] + - [524, 3.07891] - - [5888, 64, 1, 1280] - - [381, 4499.59] + - [453, 4499.59] - - [3584, 128, 1, 1280] - - [438, 5929.01] + - [510, 5929.01] - - [4, 256, 1, 1280] - - [457, 170.767] + - [529, 170.767] - - [128, 704, 1, 3328] - - [381, 4379.37] + - [453, 4379.37] - - [4288, 6784, 1, 256] - - [431, 7181.09] + - [503, 7181.09] - - [3584, 2944, 1, 3328] - - [437, 8553.3] + - [509, 8553.3] - - [128, 1856, 1, 256] - - [438, 3207.77] + - [510, 3207.77] - - [64, 4288, 1, 256] - - [432, 2907.99] + - [504, 2907.99] - - [4, 3584, 1, 3328] - - [390, 560.605] + - [462, 560.605] - - [64, 4, 1, 3328] - - [459, 67.5025] + - [531, 67.5025] - - [4, 64, 1, 3328] - - [459, 88.8467] + - [531, 88.8467] - - [5888, 2944, 1, 256] - - [437, 7255.77] + - [509, 7255.77] - - [1856, 64, 1, 256] - - [383, 1743.72] + - [455, 1743.72] - - [5056, 128, 1, 1280] - - [438, 6009.79] + - [510, 6009.79] - - [448, 4288, 1, 1280] - - [439, 6466.82] + - [511, 6466.82] - - [448, 1856, 1, 3328] - - [439, 6381.99] + - [511, 6381.99] - - [1024, 4288, 1, 128] - - [424, 3491.87] + - [496, 3491.87] - - [4, 1024, 1, 256] - - [457, 172.563] + - [529, 172.563] - - [5056, 4288, 1, 256] - - [437, 8241.52] + - [509, 8241.52] - - [1024, 448, 1, 256] - - [441, 4218.51] + - [513, 4218.51] - - [1024, 3584, 1, 256] - - [437, 6513.69] + - [509, 6513.69] - - [2944, 128, 1, 1280] - - [381, 4710.48] + - [453, 4710.48] - - [2048, 32, 1, 2048] - - [396, 1779.23] + - [468, 1779.23] - - [64, 256, 1, 256] - - [390, 655.46] + - [462, 655.46] - - [1408, 4, 1, 128] - - [453, 20.1249] + - [525, 20.1249] - - [128, 2368, 1, 128] - - [353, 1707.73] + - [425, 1707.73] - - [256, 704, 1, 1280] - - [432, 3735.31] + - [504, 3735.31] - - [64, 2368, 1, 128] - - [360, 1049.81] + - [432, 1049.81] - - [6784, 6784, 1, 3328] - - [437, 9277.94] + - [509, 9277.94] - - [448, 5888, 1, 1280] - - [443, 7319.75] + - [515, 7319.75] - - [5056, 448, 1, 128] - - [425, 3694.43] + - [497, 3694.43] - - [4288, 704, 1, 1280] - - [435, 7890.96] + - [507, 7890.96] - - [3584, 2944, 1, 128] - - [427, 4124.71] + - [499, 4124.71] - - [6784, 256, 1, 1280] - - [443, 7185.83] + - [515, 7185.83] - - [256, 2944, 1, 1280] - - [432, 6736.76] + - [504, 6736.76] - - [64, 4288, 1, 128] - - [351, 1614.41] + - [423, 1614.41] - - [2368, 5888, 1, 3328] - - [433, 8616.46] + - [505, 8616.46] - - [4, 64, 1, 256] - - [370, 11.4778] + - [442, 11.4778] - - [704, 1024, 1, 3328] - - [438, 6801.92] + - [510, 6801.92] - - [2368, 1856, 1, 1280] - - [435, 7853.57] + - [507, 7853.57] - - [448, 5056, 1, 3328] - - [438, 7453.04] + - [510, 7453.04] - - [128, 448, 1, 128] - - [353, 530.449] + - [425, 530.449] - - [128, 6784, 1, 256] - - [433, 5557.55] + - [505, 5557.55] - - [3584, 4288, 1, 128] - - [424, 4462.73] + - [496, 4462.73] - - [64, 448, 1, 128] - - [353, 278.132] + - [425, 278.132] - - [5888, 4288, 1, 3328] - - [430, 9153.55] + - [502, 9153.55] - - [2368, 704, 1, 256] - - [437, 5350.78] + - [509, 5350.78] - - [256, 1856, 1, 3328] - - [432, 6536.35] + - [504, 6536.35] - - [1856, 128, 1, 256] - - [446, 2847.36] + - [518, 2847.36] - - [6784, 128, 1, 128] - - [426, 2530.82] + - [498, 2530.82] - - [3584, 1408, 1, 128] - - [427, 3625.62] + - [499, 3625.62] - - [1856, 5056, 1, 1280] - - [433, 8123.39] + - [505, 8123.39] - - [2944, 1024, 1, 1280] - - [443, 8450.41] + - [515, 8450.41] - - [5056, 4, 1, 256] - - [457, 380.787] + - [529, 380.787] - - [3584, 5888, 1, 3328] - - [435, 8567.99] + - [507, 8567.99] - - [2368, 4288, 1, 256] - - [439, 7858.07] + - [511, 7858.07] - - [1024, 2368, 1, 3328] - - [433, 6776.45] + - [505, 6776.45] - - [64, 704, 1, 3328] - - [388, 3503.52] + - [460, 3503.52] - - [704, 1408, 1, 256] - - [433, 6099.99] + - [505, 6099.99] - - [4096, 128, 1, 4096] - - [413, 4116.57] + - [485, 4116.57] - - [1024, 3584, 1, 1280] - - [443, 7231.65] + - [515, 7231.65] - - [4288, 5888, 1, 3328] - - [437, 8762.42] + - [509, 8762.42] - - [4288, 4, 1, 1280] - - [390, 492.797] + - [462, 492.797] - - [4608, 16, 1, 1536] - - [391, 1892.58] + - [463, 1892.58] - - [5888, 64, 1, 128] - - [368, 1747.73] + - [440, 1747.73] - - [4, 5888, 1, 128] - - [453, 84.5915] + - [525, 84.5915] - - [1024, 2944, 1, 3328] - - [441, 6907.05] + - [513, 6907.05] - - [6784, 1856, 1, 256] - - [437, 6274.07] + - [509, 6274.07] - - [2048, 64, 1, 2048] - - [420, 2371.44] + - [492, 2371.44] - - [256, 6784, 1, 1280] - - [437, 7067.04] + - [509, 7067.04] - - [1856, 3584, 1, 256] - - [443, 7706.87] + - [515, 7706.87] - - [128, 448, 1, 3328] - - [388, 3995.93] + - [460, 3995.93] - - [6784, 1856, 1, 128] - - [425, 4459.09] + - [497, 4459.09] - - [4, 448, 1, 256] - - [390, 84.4294] + - [462, 84.4294] - - [5056, 128, 1, 256] - - [438, 4954.5] + - [510, 4954.5] - - [512, 24000, 1, 2816] - - [431, 8994.98] + - [503, 8994.98] - - [256, 5888, 1, 1280] - - [430, 6184.0] + - [502, 6184.0] - - [4, 128, 1, 1280] - - [458, 71.9597] + - [530, 71.9597] - - [16384, 1600, 1, 4096] - - [437, 6921.09] + - [509, 6921.09] - - [6784, 128, 1, 1280] - - [441, 6486.37] + - [513, 6486.37] - - [64, 1408, 1, 256] - - [378, 1647.86] + - [450, 1647.86] - - [2368, 1408, 1, 128] - - [425, 3937.1] + - [497, 3937.1] - - [1856, 448, 1, 256] - - [438, 4635.57] + - [510, 4635.57] - - [1408, 1024, 1, 128] - - [421, 3208.51] + - [493, 3208.51] - - [128, 64, 1, 128] - - [350, 70.192] + - [422, 70.192] - - [6784, 3584, 1, 3328] - - [443, 8466.28] + - [515, 8466.28] - - [1760, 7000, 1, 1760] - - [441, 8149.21] + - [513, 8149.21] - - [2944, 64, 1, 3328] - - [374, 5018.09] + - [446, 5018.09] - - [64, 64, 1, 128] - - [350, 35.5249] + - [422, 35.5249] - - [2368, 5056, 1, 1280] - - [437, 8764.0] + - [509, 8764.0] - - [64, 4, 1, 1280] - - [459, 43.6745] + - [531, 43.6745] - - [1408, 2368, 1, 1280] - - [438, 7660.38] + - [510, 7660.38] - - [128, 1408, 1, 1280] - - [373, 4185.27] + - [445, 4185.27] - - [256, 64, 1, 3328] - - [398, 2071.75] + - [470, 2071.75] - - [704, 4288, 1, 128] - - [421, 4069.18] + - [493, 4069.18] - - [128, 1856, 1, 3328] - - [404, 5776.15] + - [476, 5776.15] - - [2944, 2944, 1, 256] - - [443, 7949.31] + - [515, 7949.31] - - [2944, 4, 1, 1280] - - [390, 483.218] + - [462, 483.218] - - [5888, 4, 1, 256] - - [375, 396.765] + - [447, 396.765] - - [6784, 256, 1, 256] - - [449, 4044.83] + - [521, 4044.83] - - [256, 5056, 1, 3328] - - [432, 7607.37] + - [504, 7607.37] - - [128, 4288, 1, 1280] - - [373, 4958.78] + - [445, 4958.78] - - [5056, 1856, 1, 128] - - [425, 4560.94] + - [497, 4560.94] - - [5056, 1024, 1, 3328] - - [437, 8634.18] + - [509, 8634.18] - - [128, 128, 1, 256] - - [375, 699.151] + - [447, 699.151] - - [1760, 64, 1, 1760] - - [381, 4580.65] + - [453, 4580.65] - - [4288, 3584, 1, 3328] - - [443, 9143.76] + - [515, 9143.76] - - [448, 704, 1, 3328] - - [432, 4473.43] + - [504, 4473.43] - - [448, 448, 1, 128] - - [363, 1264.38] + - [435, 1264.38] - - [1024, 2368, 1, 1280] - - [441, 7452.51] + - [513, 7452.51] - - [1856, 704, 1, 3328] - - [432, 6103.34] + - [504, 6103.34] - - [4, 2368, 1, 128] - - [452, 96.019] + - [524, 96.019] - - [5888, 6784, 1, 3328] - - [437, 9131.74] + - [509, 9131.74] - - [704, 4288, 1, 1280] - - [439, 7906.46] + - [511, 7906.46] - - [704, 256, 1, 256] - - [432, 2772.78] + - [504, 2772.78] - - [1024, 48000, 1, 2048] - - [436, 6513.45] + - [508, 6513.45] - - [4288, 1024, 1, 128] - - [421, 4291.77] + - [493, 4291.77] - - [512, 2048, 1, 49] - - [467, 4555.08] + - [539, 4555.08] - - [512, 128, 1, 784] - - [460, 3195.39] + - [532, 3195.39] - - [2048, 512, 1, 49] - - [468, 4253.43] + - [540, 4253.43] - - [1024, 256, 1, 196] - - [464, 4039.43] + - [536, 4039.43] - - [256, 64, 1, 3136] - - [462, 3015.37] + - [534, 3015.37] - - [256, 1024, 1, 196] - - [466, 4225.45] + - [538, 4225.45] - - [64, 256, 1, 3136] - - [463, 3058.45] + - [535, 3058.45] - - [128, 512, 1, 784] - - [461, 3380.38] + - [533, 3380.38] - - [64, 64, 1, 3136] - - [465, 1372.44] + - [537, 1372.44] - - [1024, 1024, 1, 3328] - - [578, 8705.1] + - [650, 8705.1] - - [2048, 200, 1, 3200] - - [583, 6173.42] + - [655, 6173.42] - - [1024, 200, 1, 13312] - - [481, 5213.31] + - [553, 5213.31] - - [1024, 256, 1, 1536] - - [583, 5859.43] + - [655, 5859.43] - - [4096, 256, 1, 12288] - - [588, 8807.52] + - [660, 8807.52] - - [64, 200, 1, 1024] - - [555, 366.632] + - [627, 366.632] - - [32, 512, 1, 1024] - - [510, 453.049] + - [582, 453.049] - - [2048, 256, 1, 3328] - - [572, 7876.73] + - [644, 7876.73] - - [4096, 512, 1, 32] - - [576, 3975.74] + - [648, 3975.74] - - [2048, 256, 1, 13312] - - [553, 7837.81] + - [625, 7837.81] - - [4096, 200, 1, 11264] - - [588, 6902.76] + - [660, 6902.76] - - [2048, 512, 1, 1024] - - [582, 8100.14] + - [654, 8100.14] - - [2048, 1024, 1, 1664] - - [482, 9082.08] + - [554, 9082.08] - - [1024, 1024, 1, 64] - - [578, 4258.28] + - [650, 4258.28] - - [512, 1024, 1, 1536] - - [572, 7597.33] + - [644, 7597.33] - - [1024, 256, 1, 15360] - - [473, 6735.24] + - [545, 6735.24] - - [1, 512, 1, 1024] - - [523, 15.1657] + - [595, 15.1657] - - [4096, 512, 1, 1408] - - [485, 9024.52] + - [557, 9024.52] - - [1024, 200, 1, 1408] - - [583, 4461.09] + - [655, 4461.09] - - [1024, 512, 1, 512] - - [577, 6528.2] + - [649, 6528.2] - - [4096, 256, 1, 15360] - - [584, 8824.03] + - [656, 8824.03] - - [2048, 512, 1, 640] - - [574, 7989.25] + - [646, 7989.25] - - [4096, 1024, 1, 1280] - - [480, 9421.54] + - [552, 9421.54] - - [1024, 200, 1, 6144] - - [572, 4966.52] + - [644, 4966.52] - - [1024, 1024, 1, 512] - - [574, 7731.54] + - [646, 7731.54] - - [128, 512, 1, 2048] - - [490, 2190.34] + - [562, 2190.34] - - [2048, 1024, 1, 640] - - [480, 8581.8] + - [552, 8581.8] - - [1024, 256, 1, 3328] - - [572, 6192.71] + - [644, 6192.71] - - [4096, 1024, 1, 13312] - - [485, 9642.59] + - [557, 9642.59] - - [2048, 256, 1, 2048] - - [572, 7485.75] + - [644, 7485.75] - - [2048, 1024, 1, 13312] - - [485, 9352.26] + - [557, 9352.26] - - [2048, 512, 1, 16640] - - [573, 8839.17] + - [645, 8839.17] - - [1024, 512, 1, 128] - - [577, 4280.0] + - [649, 4280.0] - - [2048, 1024, 1, 3584] - - [480, 9264.72] + - [552, 9264.72] - - [2048, 512, 1, 256] - - [588, 6990.61] + - [660, 6990.61] - - [512, 256, 1, 3200] - - [535, 4154.52] + - [607, 4154.52] - - [4096, 1024, 1, 1920] - - [480, 9535.32] + - [552, 9535.32] - - [4096, 200, 1, 2560] - - [585, 6754.65] + - [657, 6754.65] - - [1024, 256, 1, 16384] - - [475, 6289.6] + - [547, 6289.6] - - [1024, 1024, 1, 1152] - - [578, 8407.39] + - [650, 8407.39] - - [2048, 200, 1, 32] - - [521, 1412.51] + - [593, 1412.51] - - [512, 1024, 1, 2816] - - [572, 7843.25] + - [644, 7843.25] - - [4096, 256, 1, 14336] - - [584, 8844.77] + - [656, 8844.77] - - [1024, 200, 1, 4608] - - [583, 4931.74] + - [655, 4931.74] - - [1024, 200, 1, 16384] - - [478, 5135.15] + - [550, 5135.15] - - [64, 256, 1, 1024] - - [556, 461.013] + - [628, 461.013] - - [1, 200, 1, 1024] - - [538, 7.49884] + - [610, 7.49884] - - [2048, 200, 1, 2080] - - [583, 6033.87] + - [655, 6033.87] - - [512, 256, 1, 1792] - - [493, 3153.71] + - [565, 3153.71] - - [2048, 200, 1, 1024] - - [583, 5711.3] + - [655, 5711.3] - - [4096, 1024, 1, 12288] - - [480, 9658.23] + - [552, 9658.23] - - [4096, 200, 1, 4096] - - [574, 6834.55] + - [646, 6834.55] - - [1024, 512, 1, 11264] - - [541, 7686.46] + - [613, 7686.46] - - [128, 512, 1, 1024] - - [511, 1458.99] + - [583, 1458.99] - - [32, 256, 1, 2048] - - [529, 384.899] + - [601, 384.899] - - [1024, 200, 1, 1792] - - [583, 4638.64] + - [655, 4638.64] - - [1024, 1024, 1, 1792] - - [578, 8550.56] + - [650, 8550.56] - - [32, 256, 1, 512] - - [562, 161.419] + - [634, 161.419] - - [512, 200, 1, 2816] - - [488, 3353.1] + - [560, 3353.1] - - [512, 200, 1, 3072] - - [473, 3298.89] + - [545, 3298.89] - - [1024, 1024, 1, 8192] - - [519, 8369.1] + - [591, 8369.1] - - [1024, 256, 1, 12288] - - [476, 6475.71] + - [548, 6475.71] - - [4096, 200, 1, 768] - - [578, 6367.97] + - [650, 6367.97] - - [1024, 512, 1, 16384] - - [594, 7367.12] + - [666, 7367.12] - - [4096, 256, 1, 1024] - - [574, 8214.16] + - [646, 8214.16] - - [1024, 512, 1, 256] - - [577, 5537.13] + - [649, 5537.13] - - [4096, 1024, 1, 8320] - - [480, 9674.26] + - [552, 9674.26] - - [4096, 256, 1, 9216] - - [582, 8791.02] + - [654, 8791.02] - - [1024, 512, 1, 1408] - - [572, 7459.65] + - [644, 7459.65] - - [1024, 512, 1, 5632] - - [583, 7997.91] + - [655, 7997.91] - - [4096, 200, 1, 256] - - [588, 5371.9] + - [660, 5371.9] - - [1024, 200, 1, 128] - - [566, 1998.15] + - [638, 1998.15] - - [256, 200, 1, 1024] - - [535, 1196.01] + - [607, 1196.01] - - [1024, 200, 1, 5120] - - [583, 4957.44] + - [655, 4957.44] - - [512, 1024, 1, 3072] - - [596, 7104.07] + - [668, 7104.07] - - [4096, 1024, 1, 15360] - - [480, 9669.04] + - [552, 9669.04] - - [1, 256, 1, 2048] - - [522, 13.9262] + - [594, 13.9262] - - [1024, 1024, 1, 4160] - - [574, 8759.3] + - [646, 8759.3] - - [1024, 256, 1, 256] - - [581, 3728.37] + - [653, 3728.37] - - [2048, 256, 1, 384] - - [583, 6123.17] + - [655, 6123.17] - - [512, 256, 1, 2560] - - [537, 3809.64] + - [609, 3809.64] - - [4096, 512, 1, 3072] - - [485, 9215.19] + - [557, 9215.19] - - [1024, 256, 1, 4160] - - [572, 6293.49] + - [644, 6293.49] - - [4096, 512, 1, 13312] - - [482, 9367.32] + - [554, 9367.32] - - [4096, 1024, 1, 3840] - - [480, 9631.57] + - [552, 9631.57] - - [4096, 200, 1, 640] - - [578, 6206.16] + - [650, 6206.16] - - [32, 200, 1, 2048] - - [516, 303.507] + - [588, 303.507] - - [1024, 200, 1, 512] - - [572, 3713.19] + - [644, 3713.19] - - [1024, 1024, 1, 7168] - - [575, 8475.74] + - [647, 8475.74] - - [2048, 1024, 1, 3200] - - [480, 9271.34] + - [552, 9271.34] - - [512, 512, 1, 1536] - - [583, 5832.27] + - [655, 5832.27] - - [4096, 256, 1, 768] - - [588, 8066.07] + - [660, 8066.07] - - [2048, 256, 1, 6656] - - [572, 8034.87] + - [644, 8034.87] - - [1024, 256, 1, 896] - - [572, 5467.54] + - [644, 5467.54] - - [2048, 256, 1, 512] - - [583, 6465.31] + - [655, 6465.31] - - [2048, 200, 1, 3072] - - [583, 6165.78] + - [655, 6165.78] - - [128, 200, 1, 1024] - - [540, 692.87] + - [612, 692.87] - - [4096, 512, 1, 3840] - - [485, 9272.7] + - [557, 9272.7] - - [1024, 200, 1, 3200] - - [583, 4838.85] + - [655, 4838.85] - - [4096, 512, 1, 5632] - - [480, 9335.52] + - [552, 9335.52] - - [4096, 512, 1, 64] - - [515, 5275.95] + - [587, 5275.95] - - [1024, 512, 1, 2816] - - [572, 7816.68] + - [644, 7816.68] - - [4096, 256, 1, 7680] - - [578, 8795.5] + - [650, 8795.5] - - [4096, 200, 1, 1024] - - [588, 6448.91] + - [660, 6448.91] - - [1024, 512, 1, 12288] - - [542, 7624.67] + - [614, 7624.67] - - [2048, 1024, 1, 512] - - [485, 8436.16] + - [557, 8436.16] - - [128, 256, 1, 2048] - - [559, 1342.28] + - [631, 1342.28] - - [2048, 200, 1, 1792] - - [583, 6020.47] + - [655, 6020.47] - - [1024, 1024, 1, 2816] - - [574, 8670.5] + - [646, 8670.5] - - [2048, 512, 1, 1536] - - [585, 8466.32] + - [657, 8466.32] - - [4096, 256, 1, 3072] - - [582, 8631.47] + - [654, 8631.47] - - [1024, 200, 1, 1536] - - [564, 4577.7] + - [636, 4577.7] - - [1024, 256, 1, 1024] - - [572, 5491.82] + - [644, 5491.82] - - [4096, 512, 1, 8192] - - [485, 9325.64] + - [557, 9325.64] - - [128, 1024, 1, 512] - - [583, 2534.42] + - [655, 2534.42] - - [4096, 512, 1, 2304] - - [480, 9193.09] + - [552, 9193.09] - - [2048, 256, 1, 5632] - - [583, 7999.64] + - [655, 7999.64] - - [1024, 256, 1, 5120] - - [583, 6307.32] + - [655, 6307.32] - - [1024, 512, 1, 6656] - - [583, 8028.95] + - [655, 8028.95] - - [4096, 512, 1, 2816] - - [480, 9234.5] + - [552, 9234.5] - - [4096, 200, 1, 2080] - - [567, 6697.96] + - [639, 6697.96] - - [1024, 200, 1, 2304] - - [583, 4752.91] + - [655, 4752.91] - - [2048, 200, 1, 13312] - - [572, 6346.23] + - [644, 6346.23] - - [64, 1024, 1, 1024] - - [556, 1359.68] + - [628, 1359.68] - - [4096, 256, 1, 3584] - - [578, 8668.9] + - [650, 8668.9] - - [2048, 1024, 1, 7680] - - [480, 9365.88] + - [552, 9365.88] - - [1024, 256, 1, 1664] - - [572, 5907.57] + - [644, 5907.57] - - [1, 512, 1, 2048] - - [499, 23.5057] + - [571, 23.5057] - - [512, 512, 1, 1024] - - [572, 5360.23] + - [644, 5360.23] - - [2048, 256, 1, 8192] - - [544, 7665.31] + - [616, 7665.31] - - [2048, 512, 1, 512] - - [574, 7767.33] + - [646, 7767.33] - - [4096, 512, 1, 1920] - - [480, 9133.04] + - [552, 9133.04] - - [4096, 200, 1, 12288] - - [588, 6910.75] + - [660, 6910.75] - - [1024, 512, 1, 3072] - - [518, 7310.43] + - [590, 7310.43] - - [2048, 512, 1, 1152] - - [578, 8342.36] + - [650, 8342.36] - - [1024, 256, 1, 2080] - - [572, 6010.46] + - [644, 6010.46] - - [4096, 1024, 1, 32] - - [568, 4793.59] + - [640, 4793.59] - - [4096, 512, 1, 16640] - - [480, 9365.41] + - [552, 9365.41] - - [2048, 200, 1, 9216] - - [572, 6315.98] + - [644, 6315.98] - - [2048, 200, 1, 2560] - - [572, 6119.24] + - [644, 6119.24] - - [2048, 1024, 1, 1024] - - [480, 8628.69] + - [552, 8628.69] - - [2048, 256, 1, 4608] - - [572, 7951.39] + - [644, 7951.39] - - [512, 200, 1, 768] - - [524, 2132.51] + - [596, 2132.51] - - [128, 256, 1, 512] - - [524, 670.117] + - [596, 670.117] - - [4096, 512, 1, 1792] - - [485, 9127.01] + - [557, 9127.01] - - [4096, 1024, 1, 8192] - - [480, 9591.37] + - [552, 9591.37] - - [1024, 256, 1, 2816] - - [583, 6119.11] + - [655, 6119.11] - - [1024, 1024, 1, 13312] - - [575, 8529.37] + - [647, 8529.37] - - [2048, 1024, 1, 4160] - - [480, 9305.67] + - [552, 9305.67] - - [2048, 256, 1, 3584] - - [572, 7903.23] + - [644, 7903.23] - - [128, 200, 1, 2048] - - [540, 1135.91] + - [612, 1135.91] - - [4096, 512, 1, 10240] - - [482, 9339.59] + - [554, 9339.59] - - [4096, 512, 1, 512] - - [480, 8446.78] + - [552, 8446.78] - - [2048, 1024, 1, 6656] - - [480, 9331.75] + - [552, 9331.75] - - [1024, 512, 1, 640] - - [572, 6776.04] + - [644, 6776.04] - - [2048, 512, 1, 768] - - [574, 8085.51] + - [646, 8085.51] - - [2048, 200, 1, 1408] - - [572, 5880.17] + - [644, 5880.17] - - [4096, 200, 1, 2048] - - [588, 6691.71] + - [660, 6691.71] - - [1024, 1024, 1, 5632] - - [574, 8749.63] + - [646, 8749.63] - - [2048, 512, 1, 3584] - - [578, 8704.23] + - [650, 8704.23] - - [64, 512, 1, 512] - - [514, 667.983] + - [586, 667.983] - - [64, 200, 1, 512] - - [524, 251.388] + - [596, 251.388] - - [1024, 200, 1, 64] - - [479, 1310.82] + - [551, 1310.82] - - [512, 512, 1, 2304] - - [572, 6078.8] + - [644, 6078.8] - - [2048, 1024, 1, 14336] - - [480, 9321.94] + - [552, 9321.94] - - [4096, 512, 1, 11264] - - [482, 9339.95] + - [554, 9339.95] - - [4096, 512, 1, 128] - - [567, 6566.53] + - [639, 6566.53] - - [1024, 512, 1, 64] - - [587, 2953.84] + - [659, 2953.84] - - [4096, 512, 1, 768] - - [480, 8738.23] + - [552, 8738.23] - - [4096, 1024, 1, 11264] - - [480, 9637.78] + - [552, 9637.78] - - [1, 256, 1, 1024] - - [570, 8.93234] + - [642, 8.93234] - - [4096, 200, 1, 7680] - - [567, 6889.57] + - [639, 6889.57] - - [1024, 200, 1, 12288] - - [539, 5237.74] + - [611, 5237.74] - - [1024, 1024, 1, 1280] - - [574, 8418.17] + - [646, 8418.17] - - [4096, 1024, 1, 16640] - - [480, 9675.01] + - [552, 9675.01] - - [2048, 1024, 1, 5632] - - [480, 9327.85] + - [552, 9327.85] - - [1024, 200, 1, 15360] - - [539, 5386.63] + - [611, 5386.63] - - [1, 1024, 1, 1024] - - [589, 27.3499] + - [661, 27.3499] - - [2048, 256, 1, 16384] - - [550, 7652.75] + - [622, 7652.75] - - [4096, 512, 1, 12288] - - [482, 9359.51] + - [554, 9359.51] - - [2048, 200, 1, 896] - - [583, 5628.96] + - [655, 5628.96] - - [4096, 1024, 1, 5632] - - [480, 9626.78] + - [552, 9626.78] - - [2048, 256, 1, 32] - - [576, 1889.43] + - [648, 1889.43] - - [2048, 256, 1, 1280] - - [572, 7390.94] + - [644, 7390.94] - - [4096, 256, 1, 4096] - - [574, 8694.37] + - [646, 8694.37] - - [2048, 256, 1, 11264] - - [572, 8113.95] + - [644, 8113.95] - - [4096, 200, 1, 9216] - - [574, 6891.08] + - [646, 6891.08] - - [1024, 512, 1, 4096] - - [520, 7348.46] + - [592, 7348.46] - - [2048, 1024, 1, 10240] - - [482, 9095.91] + - [554, 9095.91] - - [4096, 1024, 1, 640] - - [480, 9115.68] + - [552, 9115.68] - - [128, 1024, 1, 2048] - - [473, 3270.51] + - [545, 3270.51] - - [4096, 200, 1, 3840] - - [567, 6836.26] + - [639, 6836.26] - - [1024, 1024, 1, 1920] - - [578, 8562.82] + - [650, 8562.82] - - [2048, 200, 1, 7168] - - [583, 6296.23] + - [655, 6296.23] - - [2048, 512, 1, 16384] - - [474, 8632.51] + - [546, 8632.51] - - [2048, 1024, 1, 12288] - - [480, 9158.08] + - [552, 9158.08] - - [4096, 1024, 1, 10240] - - [480, 9658.84] + - [552, 9658.84] - - [1024, 1024, 1, 8320] - - [582, 8799.58] + - [654, 8799.58] - - [1024, 256, 1, 9216] - - [572, 6375.23] + - [644, 6375.23] - - [4096, 256, 1, 1152] - - [567, 8301.09] + - [639, 8301.09] - - [512, 200, 1, 2560] - - [533, 3088.51] + - [605, 3088.51] - - [2048, 256, 1, 1920] - - [572, 7714.94] + - [644, 7714.94] - - [2048, 1024, 1, 4608] - - [480, 9305.7] + - [552, 9305.7] - - [512, 256, 1, 1024] - - [580, 2887.74] + - [652, 2887.74] - - [1024, 256, 1, 1920] - - [564, 5913.12] + - [636, 5913.12] - - [4096, 512, 1, 3584] - - [480, 9275.69] + - [552, 9275.69] - - [2048, 512, 1, 4160] - - [585, 8734.03] + - [657, 8734.03] - - [2048, 512, 1, 5632] - - [588, 8758.98] + - [660, 8758.98] - - [4096, 1024, 1, 4608] - - [480, 9657.22] + - [552, 9657.22] - - [4096, 1024, 1, 3328] - - [480, 9621.45] + - [552, 9621.45] - - [4096, 256, 1, 7168] - - [574, 8770.05] + - [646, 8770.05] - - [4096, 200, 1, 128] - - [588, 4458.33] + - [660, 4458.33] - - [2048, 200, 1, 5120] - - [572, 6176.91] + - [644, 6176.91] - - [1024, 1024, 1, 6656] - - [574, 8780.45] + - [646, 8780.45] - - [512, 1024, 1, 3200] - - [583, 7887.09] + - [655, 7887.09] - - [512, 200, 1, 2304] - - [473, 2991.09] + - [545, 2991.09] - - [2048, 1024, 1, 9216] - - [485, 9325.46] + - [557, 9325.46] - - [2048, 256, 1, 1536] - - [583, 7551.73] + - [655, 7551.73] - - [4096, 256, 1, 256] - - [588, 6932.83] + - [660, 6932.83] - - [2048, 512, 1, 1408] - - [585, 8430.86] + - [657, 8430.86] - - [1024, 256, 1, 384] - - [577, 4462.13] + - [649, 4462.13] - - [2048, 1024, 1, 2304] - - [480, 9174.94] + - [552, 9174.94] - - [4096, 512, 1, 6144] - - [482, 9284.25] + - [554, 9284.25] - - [1024, 200, 1, 14336] - - [471, 5268.57] + - [543, 5268.57] - - [1024, 512, 1, 2080] - - [583, 7736.47] + - [655, 7736.47] - - [2048, 512, 1, 2304] - - [585, 8616.07] + - [657, 8616.07] - - [4096, 512, 1, 15360] - - [485, 9362.17] + - [557, 9362.17] - - [1024, 256, 1, 32] - - [505, 1028.12] + - [577, 1028.12] - - [1024, 200, 1, 2816] - - [583, 4780.58] + - [655, 4780.58] - - [4096, 200, 1, 512] - - [574, 6054.23] + - [646, 6054.23] - - [4096, 1024, 1, 7168] - - [485, 9468.49] + - [557, 9468.49] - - [2048, 256, 1, 14336] - - [546, 7865.52] + - [618, 7865.52] - - [1024, 200, 1, 3072] - - [583, 4804.2] + - [655, 4804.2] - - [2048, 200, 1, 1280] - - [583, 5846.31] + - [655, 5846.31] - - [1024, 1024, 1, 2304] - - [574, 8633.32] + - [646, 8633.32] - - [4096, 1024, 1, 9216] - - [480, 9641.03] + - [552, 9641.03] - - [2048, 512, 1, 4608] - - [585, 8743.3] + - [657, 8743.3] - - [4096, 1024, 1, 7680] - - [480, 9684.86] + - [552, 9684.86] - - [4096, 256, 1, 6144] - - [585, 8757.24] + - [657, 8757.24] - - [4096, 256, 1, 896] - - [578, 8258.93] + - [650, 8258.93] - - [512, 256, 1, 1536] - - [562, 3065.36] + - [634, 3065.36] - - [1024, 256, 1, 512] - - [572, 4752.85] + - [644, 4752.85] - - [2048, 256, 1, 640] - - [572, 6776.04] + - [644, 6776.04] - - [256, 256, 1, 2048] - - [509, 2249.06] + - [581, 2249.06] - - [2048, 1024, 1, 8192] - - [480, 9178.17] + - [552, 9178.17] - - [4096, 200, 1, 16640] - - [472, 7009.59] + - [544, 7009.59] - - [256, 512, 1, 512] - - [484, 2511.66] + - [556, 2511.66] - - [2048, 512, 1, 384] - - [585, 7467.7] + - [657, 7467.7] - - [2048, 200, 1, 16384] - - [553, 6327.31] + - [625, 6327.31] - - [4096, 200, 1, 10240] - - [578, 6892.74] + - [650, 6892.74] - - [1024, 512, 1, 9216] - - [527, 7530.09] + - [599, 7530.09] - - [4096, 1024, 1, 64] - - [502, 6260.26] + - [574, 6260.26] - - [4096, 200, 1, 1920] - - [588, 6710.27] + - [660, 6710.27] - - [2048, 1024, 1, 1280] - - [480, 8998.34] + - [552, 8998.34] - - [1024, 200, 1, 3840] - - [572, 4873.87] + - [644, 4873.87] - - [256, 1024, 1, 512] - - [583, 4766.35] + - [655, 4766.35] - - [2048, 1024, 1, 3328] - - [480, 9275.2] + - [552, 9275.2] - - [1024, 256, 1, 16640] - - [537, 6837.22] + - [609, 6837.22] - - [4096, 512, 1, 14336] - - [485, 9354.42] + - [557, 9354.42] - - [1024, 1024, 1, 16640] - - [582, 8832.37] + - [654, 8832.37] - - [1024, 256, 1, 1152] - - [583, 5642.66] + - [655, 5642.66] - - [512, 512, 1, 512] - - [572, 4779.93] + - [644, 4779.93] - - [4096, 512, 1, 8320] - - [485, 9327.96] + - [557, 9327.96] - - [2048, 512, 1, 7680] - - [588, 8793.96] + - [660, 8793.96] - - [4096, 1024, 1, 6656] - - [480, 9667.03] + - [552, 9667.03] - - [1024, 512, 1, 3584] - - [583, 7900.57] + - [655, 7900.57] - - [1024, 1024, 1, 32] - - [568, 2974.78] + - [640, 2974.78] - - [512, 512, 1, 2816] - - [564, 6155.85] + - [636, 6155.85] - - [2048, 512, 1, 1664] - - [588, 8496.55] + - [660, 8496.55] - - [1024, 1024, 1, 14336] - - [474, 8624.74] + - [546, 8624.74] - - [2048, 200, 1, 2048] - - [583, 6029.86] + - [655, 6029.86] - - [1024, 1024, 1, 3584] - - [574, 8702.62] + - [646, 8702.62] - - [512, 200, 1, 1280] - - [488, 2350.75] + - [560, 2350.75] - - [4096, 256, 1, 6656] - - [588, 8788.41] + - [660, 8788.41] - - [4096, 256, 1, 4160] - - [565, 8728.44] + - [637, 8728.44] - - [128, 256, 1, 1024] - - [547, 859.589] + - [619, 859.589] - - [512, 200, 1, 3200] - - [488, 3376.85] + - [560, 3376.85] - - [2048, 512, 1, 9216] - - [571, 8806.4] + - [643, 8806.4] - - [2048, 1024, 1, 256] - - [567, 7713.76] + - [639, 7713.76] - - [1024, 256, 1, 2304] - - [583, 6015.83] + - [655, 6015.83] - - [1024, 200, 1, 8192] - - [583, 5022.02] + - [655, 5022.02] - - [2048, 256, 1, 3072] - - [500, 7515.09] + - [572, 7515.09] - - [2048, 256, 1, 8320] - - [572, 8063.68] + - [644, 8063.68] - - [4096, 512, 1, 1024] - - [482, 8824.41] + - [554, 8824.41] - - [1024, 512, 1, 3200] - - [572, 7866.39] + - [644, 7866.39] - - [1024, 512, 1, 896] - - [564, 7161.11] + - [636, 7161.11] - - [2048, 512, 1, 1280] - - [578, 8384.52] + - [650, 8384.52] - - [4096, 200, 1, 64] - - [487, 3260.6] + - [559, 3260.6] - - [1024, 256, 1, 6144] - - [593, 6143.72] + - [665, 6143.72] - - [1024, 200, 1, 2560] - - [572, 4762.89] + - [644, 4762.89] - - [1024, 1024, 1, 5120] - - [501, 8454.23] + - [573, 8454.23] - - [2048, 512, 1, 6656] - - [578, 8799.05] + - [650, 8799.05] - - [4096, 1024, 1, 1536] - - [480, 9503.37] + - [552, 9503.37] - - [1024, 1024, 1, 128] - - [503, 5825.52] + - [575, 5825.52] - - [512, 1024, 1, 1792] - - [572, 7701.12] + - [644, 7701.12] - - [2048, 1024, 1, 32] - - [483, 3938.41] + - [555, 3938.41] - - [4096, 256, 1, 2816] - - [567, 8652.2] + - [639, 8652.2] - - [1024, 1024, 1, 15360] - - [474, 8719.7] + - [546, 8719.7] - - [1024, 256, 1, 5632] - - [572, 6344.18] + - [644, 6344.18] - - [1024, 1024, 1, 4096] - - [575, 8187.86] + - [647, 8187.86] - - [2048, 200, 1, 4160] - - [583, 6222.48] + - [655, 6222.48] - - [512, 256, 1, 768] - - [514, 2771.67] + - [586, 2771.67] - - [4096, 512, 1, 640] - - [485, 8590.58] + - [557, 8590.58] - - [2048, 512, 1, 8192] - - [527, 8494.9] + - [599, 8494.9] - - [1024, 512, 1, 768] - - [572, 7049.35] + - [644, 7049.35] - - [4096, 200, 1, 8320] - - [567, 6908.7] + - [639, 6908.7] - - [2048, 512, 1, 896] - - [574, 8224.23] + - [646, 8224.23] - - [4096, 200, 1, 7168] - - [585, 6878.59] + - [657, 6878.59] - - [2048, 512, 1, 13312] - - [573, 8803.04] + - [645, 8803.04] - - [64, 512, 1, 1024] - - [477, 844.024] + - [549, 844.024] - - [2048, 200, 1, 3840] - - [572, 6192.48] + - [644, 6192.48] - - [1024, 1024, 1, 768] - - [565, 8098.51] + - [637, 8098.51] - - [4096, 512, 1, 16384] - - [485, 9345.73] + - [557, 9345.73] - - [4096, 256, 1, 2304] - - [565, 8596.45] + - [637, 8596.45] - - [1, 256, 1, 4096] - - [570, 19.9293] + - [642, 19.9293] - - [1024, 1024, 1, 11264] - - [575, 8491.48] + - [647, 8491.48] - - [2048, 200, 1, 16640] - - [569, 6510.64] + - [641, 6510.64] - - [1024, 256, 1, 3072] - - [583, 6179.55] + - [655, 6179.55] - - [4096, 1024, 1, 512] - - [480, 9032.25] + - [552, 9032.25] - - [2048, 256, 1, 2816] - - [572, 7793.57] + - [644, 7793.57] - - [32, 512, 1, 512] - - [484, 318.816] + - [556, 318.816] - - [256, 512, 1, 2048] - - [535, 3369.02] + - [607, 3369.02] - - [1024, 512, 1, 384] - - [583, 6198.58] + - [655, 6198.58] - - [2048, 200, 1, 7680] - - [572, 6307.7] + - [644, 6307.7] - - [1024, 512, 1, 4608] - - [583, 7953.48] - - - [2048, 256, 1, 768] - - [583, 7059.24] + - [655, 7953.48] - - [4096, 200, 1, 32] - - [532, 2199.29] + - [604, 2199.29] - - [4096, 200, 1, 3328] - - [567, 6813.12] + - [639, 6813.12] - - [1024, 200, 1, 1152] - - [572, 4375.65] + - [644, 4375.65] - - [1024, 1024, 1, 1408] - - [574, 8457.91] + - [646, 8457.91] - - [2048, 200, 1, 15360] - - [548, 6333.1] + - [620, 6333.1] - - [512, 1024, 1, 2048] - - [558, 6280.76] + - [630, 6280.76] - - [1024, 512, 1, 1024] - - [583, 7064.19] + - [655, 7064.19] - - [1024, 200, 1, 10240] - - [572, 5030.69] + - [644, 5030.69] - - [4096, 256, 1, 5632] - - [585, 8765.22] + - [657, 8765.22] - - [512, 512, 1, 3072] - - [595, 5942.44] + - [667, 5942.44] - - [2048, 256, 1, 1408] - - [572, 7545.05] + - [644, 7545.05] - - [2048, 256, 1, 6144] - - [583, 7963.97] + - [655, 7963.97] - - [4096, 256, 1, 3328] - - [578, 8682.58] + - [650, 8682.58] - - [1024, 200, 1, 1664] - - [572, 4595.4] + - [644, 4595.4] - - [2048, 1024, 1, 1152] - - [480, 8942.65] + - [552, 8942.65] - - [2048, 512, 1, 6144] - - [573, 8729.71] + - [645, 8729.71] - - [2048, 512, 1, 3200] - - [574, 8696.56] + - [646, 8696.56] - - [4096, 1024, 1, 2080] - - [513, 9538.45] + - [585, 9538.45] - - [4096, 1024, 1, 768] - - [480, 9260.75] + - [552, 9260.75] - - [4096, 1024, 1, 2560] - - [480, 9567.27] + - [552, 9567.27] - - [64, 200, 1, 2048] - - [512, 583.161] + - [584, 583.161] - - [2048, 200, 1, 4608] - - [583, 6243.28] + - [655, 6243.28] - - [1024, 1024, 1, 6144] - - [575, 8320.25] + - [647, 8320.25] - - [4096, 256, 1, 1664] - - [578, 8503.17] + - [650, 8503.17] - - [2048, 200, 1, 384] - - [583, 4940.0] + - [655, 4940.0] - - [1, 200, 1, 2048] - - [529, 11.3281] + - [601, 11.3281] - - [4096, 256, 1, 1792] - - [588, 8504.12] + - [660, 8504.12] - - [2048, 1024, 1, 64] - - [502, 5309.35] + - [574, 5309.35] - - [4096, 1024, 1, 16384] - - [469, 9428.61] + - [541, 9428.61] - - [1024, 512, 1, 16640] - - [583, 8122.55] + - [655, 8122.55] - - [2048, 512, 1, 10240] - - [573, 8766.21] + - [645, 8766.21] - - [4096, 512, 1, 6656] - - [480, 9351.75] + - [552, 9351.75] - - [2048, 256, 1, 16640] - - [572, 8135.27] + - [644, 8135.27] - - [2048, 512, 1, 2816] - - [574, 8660.32] + - [646, 8660.32] - - [1024, 200, 1, 32] - - [492, 780.291] + - [564, 780.291] - - [1, 512, 1, 4096] - - [517, 34.8671] + - [589, 34.8671] - - [256, 256, 1, 1024] - - [524, 1490.08] + - [596, 1490.08] - - [2048, 1024, 1, 128] - - [497, 6605.3] + - [569, 6605.3] - - [2048, 1024, 1, 2080] - - [480, 9159.51] + - [552, 9159.51] - - [2048, 1024, 1, 16640] - - [480, 9371.65] + - [552, 9371.65] - - [1024, 200, 1, 384] - - [583, 3378.24] + - [655, 3378.24] - - [4096, 256, 1, 384] - - [528, 7369.3] + - [600, 7369.3] - - [4096, 256, 1, 13312] - - [582, 8776.48] + - [654, 8776.48] - - [2048, 256, 1, 128] - - [577, 4280.0] + - [649, 4280.0] - - [512, 256, 1, 2304] - - [489, 3584.98] + - [561, 3584.98] - - [2048, 1024, 1, 3072] - - [482, 9156.52] + - [554, 9156.52] - - [1024, 1024, 1, 640] - - [578, 7928.84] + - [650, 7928.84] - - [256, 512, 1, 1024] - - [583, 2843.7] + - [655, 2843.7] - - [4096, 1024, 1, 1408] - - [480, 9437.56] + - [552, 9437.56] - - [4096, 200, 1, 5632] - - [585, 6873.96] + - [657, 6873.96] - - [4096, 1024, 1, 2048] - - [480, 9437.1] + - [552, 9437.1] - - [2048, 1024, 1, 2560] - - [485, 9195.62] + - [557, 9195.62] - - [4096, 1024, 1, 128] - - [567, 7407.26] + - [639, 7407.26] - - [1024, 200, 1, 3328] - - [583, 4857.39] + - [655, 4857.39] - - [2048, 200, 1, 1152] - - [572, 5760.1] + - [644, 5760.1] - - [1024, 200, 1, 9216] - - [471, 5053.21] + - [543, 5053.21] - - [4096, 256, 1, 512] - - [565, 7617.45] + - [637, 7617.45] - - [4096, 1024, 1, 14336] - - [480, 9665.12] + - [552, 9665.12] - - [1024, 1024, 1, 384] - - [503, 7478.8] + - [575, 7478.8] - - [2048, 200, 1, 512] - - [572, 5150.28] + - [644, 5150.28] - - [2048, 256, 1, 9216] - - [551, 7717.71] + - [623, 7717.71] - - [2048, 256, 1, 1792] - - [572, 7655.94] + - [644, 7655.94] - - [4096, 512, 1, 9216] - - [482, 9331.22] + - [554, 9331.22] - - [4096, 200, 1, 15360] - - [472, 6958.14] + - [544, 6958.14] - - [1024, 512, 1, 2048] - - [571, 7067.91] + - [643, 7067.91] - - [64, 256, 1, 2048] - - [496, 723.256] + - [568, 723.256] - - [4096, 200, 1, 1792] - - [574, 6699.65] + - [646, 6699.65] - - [1, 200, 1, 4096] - - [506, 15.6387] + - [578, 15.6387] - - [2048, 1024, 1, 2048] - - [485, 9071.93] + - [557, 9071.93] - - [1024, 200, 1, 2080] - - [564, 4679.19] + - [636, 4679.19] - - [2048, 200, 1, 1536] - - [583, 5939.92] + - [655, 5939.92] - - [1024, 1024, 1, 3072] - - [545, 8333.15] + - [617, 8333.15] - - [512, 200, 1, 1792] - - [470, 2679.73] + - [542, 2679.73] - - [1024, 256, 1, 11264] - - [473, 6470.98] + - [545, 6470.98] - - [2048, 512, 1, 12288] - - [520, 8729.24] + - [592, 8729.24] - - [1024, 256, 1, 1792] - - [583, 5931.44] + - [655, 5931.44] - - [1024, 200, 1, 7168] - - [583, 4970.33] + - [655, 4970.33] - - [32, 256, 1, 1024] - - [494, 237.334] + - [566, 237.334] - - [512, 256, 1, 3072] - - [537, 3813.1] + - [609, 3813.1] - - [1024, 1024, 1, 2080] - - [574, 8600.41] + - [646, 8600.41] - - [2048, 200, 1, 2304] - - [583, 6093.32] + - [655, 6093.32] - - [4096, 512, 1, 1536] - - [480, 9075.0] + - [552, 9075.0] - - [2048, 256, 1, 7168] - - [583, 7895.26] + - [655, 7895.26] - - [2048, 512, 1, 1792] - - [585, 8531.92] + - [657, 8531.92] - - [1024, 200, 1, 2048] - - [572, 4685.43] + - [644, 4685.43] - - [1024, 1024, 1, 4608] - - [578, 8735.71] + - [650, 8735.71] - - [4096, 256, 1, 8192] - - [574, 8782.55] + - [646, 8782.55] - - [512, 1024, 1, 1280] - - [564, 7483.25] + - [636, 7483.25] - - [2048, 1024, 1, 16384] - - [474, 8878.96] + - [546, 8878.96] - - [512, 512, 1, 1280] - - [572, 5745.72] + - [644, 5745.72] - - [1024, 200, 1, 1280] - - [564, 4446.23] + - [636, 4446.23] - - [4096, 512, 1, 4096] - - [482, 9264.49] + - [554, 9264.49] - - [2048, 256, 1, 3200] - - [572, 7842.85] + - [644, 7842.85] - - [2048, 512, 1, 15360] - - [520, 8757.24] + - [592, 8757.24] - - [1024, 512, 1, 3328] - - [572, 7854.04] + - [644, 7854.04] - - [1024, 512, 1, 4160] - - [572, 7934.61] + - [644, 7934.61] - - [4096, 200, 1, 6656] - - [574, 6883.3] + - [646, 6883.3] - - [4096, 1024, 1, 1024] - - [480, 9229.44] + - [552, 9229.44] - - [2048, 200, 1, 3328] - - [583, 6182.74] + - [655, 6182.74] - - [1024, 1024, 1, 256] - - [503, 6932.83] + - [575, 6932.83] - - [512, 200, 1, 512] - - [524, 1910.77] + - [596, 1910.77] - - [2048, 256, 1, 64] - - [495, 2912.81] + - [567, 2912.81] - - [1024, 256, 1, 2560] - - [572, 6123.17] + - [644, 6123.17] - - [2048, 512, 1, 11264] - - [584, 8728.94] + - [656, 8728.94] - - [32, 200, 1, 1024] - - [579, 187.56] + - [651, 187.56] - - [32, 512, 1, 2048] - - [523, 694.521] + - [595, 694.521] - - [2048, 256, 1, 2304] - - [572, 7759.35] + - [644, 7759.35] - - [2048, 256, 1, 12288] - - [551, 7726.35] + - [623, 7726.35] - - [4096, 200, 1, 8192] - - [574, 6870.94] + - [646, 6870.94] - - [1024, 512, 1, 7168] - - [520, 7479.2] + - [592, 7479.2] - - [1024, 512, 1, 1792] - - [572, 7626.11] + - [644, 7626.11] - - [4096, 1024, 1, 1664] - - [480, 9503.54] + - [552, 9503.54] - - [4096, 200, 1, 2816] - - [567, 6775.44] + - [639, 6775.44] - - [1024, 1024, 1, 896] - - [574, 8229.99] + - [646, 8229.99] - - [1024, 200, 1, 8320] - - [535, 5173.58] + - [607, 5173.58] - - [1024, 1024, 1, 12288] - - [575, 8463.21] + - [647, 8463.21] - - [1024, 256, 1, 8320] - - [564, 6404.37] + - [636, 6404.37] - - [1024, 200, 1, 1024] - - [572, 4297.54] + - [644, 4297.54] - - [1024, 200, 1, 16640] - - [534, 5499.51] + - [606, 5499.51] - - [4096, 256, 1, 5120] - - [588, 8729.15] + - [660, 8729.15] - - [1024, 256, 1, 3200] - - [583, 6124.96] + - [655, 6124.96] - - [512, 512, 1, 2560] - - [583, 6109.79] + - [655, 6109.79] - - [4096, 256, 1, 2048] - - [588, 8511.05] + - [660, 8511.05] - - [1024, 256, 1, 640] - - [572, 5102.66] + - [644, 5102.66] - - [2048, 256, 1, 5120] - - [500, 7667.93] + - [572, 7667.93] - - [2048, 256, 1, 7680] - - [583, 8054.45] + - [655, 8054.45] - - [4096, 512, 1, 384] - - [578, 8190.77] + - [650, 8190.77] - - [2048, 200, 1, 3584] - - [572, 6166.12] + - [644, 6166.12] - - [1024, 512, 1, 1536] - - [572, 7517.9] + - [644, 7517.9] - - [4096, 512, 1, 3328] - - [480, 9259.45] + - [552, 9259.45] - - [4096, 1024, 1, 256] - - [480, 8341.79] + - [552, 8341.79] - - [2048, 200, 1, 64] - - [543, 2307.71] + - [615, 2307.71] - - [2048, 200, 1, 4096] - - [583, 6212.04] + - [655, 6212.04] - - [1024, 1024, 1, 1536] - - [574, 8484.15] + - [646, 8484.15] - - [2048, 1024, 1, 7168] - - [482, 9315.24] + - [554, 9315.24] - - [1024, 256, 1, 3584] - - [572, 6207.32] + - [644, 6207.32] - - [4096, 256, 1, 32] - - [576, 2892.72] + - [648, 2892.72] - - [4096, 256, 1, 1280] - - [585, 8392.9] + - [657, 8392.9] - - [512, 512, 1, 3200] - - [583, 6219.41] + - [655, 6219.41] - - [2048, 1024, 1, 1536] - - [482, 9052.55] + - [554, 9052.55] - - [2048, 256, 1, 1024] - - [572, 7192.9] + - [644, 7192.9] - - [128, 200, 1, 512] - - [562, 502.677] + - [634, 502.677] - - [4096, 512, 1, 7168] - - [485, 9329.11] + - [557, 9329.11] - - [1024, 512, 1, 1152] - - [572, 7358.53] + - [644, 7358.53] - - [64, 1024, 1, 2048] - - [490, 2102.51] + - [562, 2102.51] - - [2048, 512, 1, 3328] - - [574, 8694.69] + - [646, 8694.69] - - [4096, 1024, 1, 896] - - [480, 9343.02] + - [552, 9343.02] - - [1, 1024, 1, 2048] - - [530, 40.9324] + - [602, 40.9324] - - [4096, 200, 1, 3584] - - [578, 6810.3] + - [650, 6810.3] - - [4096, 1024, 1, 4096] - - [480, 9347.56] + - [552, 9347.56] - - [1024, 256, 1, 14336] - - [473, 6625.8] + - [545, 6625.8] - - [2048, 200, 1, 256] - - [572, 4413.3] + - [644, 4413.3] - - [4096, 256, 1, 16384] - - [474, 8752.13] + - [546, 8752.13] - - [4096, 256, 1, 1920] - - [565, 8533.78] + - [637, 8533.78] - - [32, 1024, 1, 512] - - [563, 647.369] + - [635, 647.369] - - [1024, 256, 1, 7680] - - [583, 6387.36] + - [655, 6387.36] - - [2048, 256, 1, 1664] - - [583, 7631.44] + - [655, 7631.44] - - [512, 200, 1, 1536] - - [488, 2576.88] + - [560, 2576.88] - - [2048, 1024, 1, 6144] - - [469, 9033.77] + - [541, 9033.77] - - [512, 256, 1, 2816] - - [535, 3977.46] + - [607, 3977.46] - - [4096, 512, 1, 4160] - - [482, 9289.02] + - [554, 9289.02] - - [4096, 512, 1, 2080] - - [561, 9150.28] + - [633, 9150.28] - - [2048, 256, 1, 15360] - - [546, 7963.97] + - [618, 7963.97] - - [4096, 200, 1, 5120] - - [585, 6861.62] + - [657, 6861.62] - - [1024, 512, 1, 8192] - - [571, 7473.25] + - [643, 7473.25] - - [4096, 200, 1, 896] - - [588, 6443.25] + - [660, 6443.25] - - [2048, 512, 1, 8320] - - [578, 8810.24] + - [650, 8810.24] - - [1024, 1024, 1, 10240] - - [586, 8436.7] + - [658, 8436.7] - - [1024, 200, 1, 768] - - [572, 4087.58] + - [644, 4087.58] - - [2048, 200, 1, 640] - - [583, 5416.3] + - [655, 5416.3] - - [512, 200, 1, 2048] - - [537, 2702.62] + - [609, 2702.62] - - [1024, 1024, 1, 9216] - - [575, 8499.08] + - [647, 8499.08] - - [4096, 200, 1, 1408] - - [585, 6613.82] + - [657, 6613.82] - - [1024, 256, 1, 13312] - - [473, 6643.54] + - [545, 6643.54] - - [1024, 256, 1, 128] - - [504, 2706.1] + - [576, 2706.1] - - [2048, 200, 1, 5632] - - [583, 6270.12] + - [655, 6270.12] - - [64, 1024, 1, 512] - - [562, 1310.82] + - [634, 1310.82] - - [1024, 512, 1, 2560] - - [583, 7731.54] + - [655, 7731.54] - - [4096, 200, 1, 1280] - - [565, 6566.83] + - [637, 6566.83] - - [1024, 200, 1, 4096] - - [583, 4911.46] + - [655, 4911.46] - - [1024, 1024, 1, 2560] - - [574, 8630.35] + - [646, 8630.35] - - [2048, 512, 1, 64] - - [578, 4152.88] + - [650, 4152.88] - - [2048, 200, 1, 8192] - - [572, 6234.21] + - [644, 6234.21] - - [2048, 512, 1, 3072] - - [582, 8614.85] + - [654, 8614.85] - - [4096, 1024, 1, 5120] - - [480, 9573.75] + - [552, 9573.75] - - [4096, 256, 1, 640] - - [567, 7913.88] + - [639, 7913.88] - - [1024, 256, 1, 1280] - - [572, 5706.64] + - [644, 5706.64] - - [2048, 1024, 1, 1920] - - [482, 9141.34] + - [554, 9141.34] - - [2048, 256, 1, 4096] - - [572, 7937.28] + - [644, 7937.28] - - [2048, 1024, 1, 15360] - - [485, 9351.96] + - [557, 9351.96] - - [4096, 200, 1, 16384] - - [474, 6975.21] + - [546, 6975.21] - - [1, 1024, 1, 4096] - - [592, 60.7815] + - [664, 60.7815] - - [4096, 1024, 1, 2816] - - [480, 9583.98] + - [552, 9583.98] - - [4096, 200, 1, 1664] - - [567, 6658.7] + - [639, 6658.7] - - [4096, 512, 1, 256] - - [498, 7731.54] + - [570, 7731.54] - - [1024, 200, 1, 896] - - [572, 4193.45] + - [644, 4193.45] - - [2048, 200, 1, 6656] - - [583, 6291.17] + - [655, 6291.17] - - [2048, 1024, 1, 5120] - - [482, 9270.57] + - [554, 9270.57] - - [512, 1024, 1, 768] - - [572, 7099.06] + - [644, 7099.06] - - [2048, 512, 1, 14336] - - [552, 8559.13] + - [624, 8559.13] - - [2048, 200, 1, 8320] - - [572, 6314.72] + - [644, 6314.72] - - [4096, 256, 1, 3840] - - [588, 8718.56] + - [660, 8718.56] - - [2048, 1024, 1, 4096] - - [469, 8973.38] + - [541, 8973.38] - - [1024, 1024, 1, 3200] - - [578, 8701.98] + - [650, 8701.98] - - [1024, 256, 1, 4608] - - [572, 6268.05] + - [644, 6268.05] - - [4096, 512, 1, 4608] - - [480, 9316.47] + - [552, 9316.47] - - [2048, 512, 1, 2048] - - [571, 8462.76] + - [643, 8462.76] - - [4096, 512, 1, 1664] - - [480, 9074.53] + - [552, 9074.53] - - [4096, 256, 1, 4608] - - [567, 8718.05] + - [639, 8718.05] - - [1024, 512, 1, 32] - - [560, 1807.99] + - [632, 1807.99] - - [1024, 512, 1, 3840] - - [572, 7936.34] + - [644, 7936.34] - - [2048, 512, 1, 1920] - - [588, 8548.27] + - [660, 8548.27] - - [2048, 1024, 1, 896] - - [480, 8843.51] + - [552, 8843.51] - - [4096, 200, 1, 6144] - - [588, 6864.76] + - [660, 6864.76] - - [1024, 512, 1, 13312] - - [541, 7763.19] + - [613, 7763.19] - - [4096, 1024, 1, 4160] - - [480, 9650.72] + - [552, 9650.72] - - [2048, 200, 1, 2816] - - [572, 6119.76] + - [644, 6119.76] - - [1024, 1024, 1, 3840] - - [567, 8709.5] + - [639, 8709.5] - - [128, 1024, 1, 1024] - - [590, 2577.25] + - [662, 2577.25] - - [2048, 1024, 1, 11264] - - [485, 9339.06] + - [557, 9339.06] - - [2048, 1024, 1, 384] - - [574, 8210.81] + - [646, 8210.81] - - [1024, 256, 1, 2048] - - [595, 5755.58] + - [667, 5755.58] - - [2048, 1024, 1, 3840] - - [482, 9288.96] + - [554, 9288.96] - - [4096, 256, 1, 8320] - - [588, 8812.38] + - [660, 8812.38] - - [2048, 256, 1, 3840] - - [564, 7857.05] + - [636, 7857.05] - - [64, 256, 1, 512] - - [562, 336.182] + - [634, 336.182] - - [4096, 512, 1, 1280] - - [482, 8993.52] + - [554, 8993.52] - - [512, 256, 1, 1280] - - [514, 2996.03] + - [586, 2996.03] - - [1024, 512, 1, 7680] - - [572, 8041.59] + - [644, 8041.59] - - [4096, 1024, 1, 1152] - - [480, 9368.48] + - [552, 9368.48] - - [256, 200, 1, 512] - - [514, 993.07] + - [586, 993.07] - - [256, 1024, 1, 2048] - - [591, 4759.59] + - [663, 4759.59] - - [2048, 200, 1, 10240] - - [583, 6329.03] + - [655, 6329.03] - - [2048, 512, 1, 5120] - - [584, 8732.56] + - [656, 8732.56] - - [2048, 1024, 1, 1408] - - [482, 9006.9] + - [554, 9006.9] - - [512, 1024, 1, 512] - - [572, 6528.2] + - [644, 6528.2] - - [1024, 200, 1, 11264] - - [539, 5194.82] + - [611, 5194.82] - - [512, 1024, 1, 1024] - - [525, 6337.1] + - [597, 6337.1] - - [2048, 512, 1, 32] - - [491, 2777.78] + - [563, 2777.78] - - [4096, 256, 1, 2560] - - [574, 8621.49] + - [646, 8621.49] - - [4096, 256, 1, 64] - - [508, 4194.4] + - [580, 4194.4] - - [32, 1024, 1, 1024] - - [509, 778.264] + - [581, 778.264] - - [2048, 200, 1, 768] - - [583, 5507.33] + - [655, 5507.33] - - [512, 512, 1, 2048] - - [531, 5338.91] + - [603, 5338.91] - - [2048, 512, 1, 2560] - - [585, 8643.69] + - [657, 8643.69] - - [512, 256, 1, 512] - - [564, 2542.1] + - [636, 2542.1] - - [1024, 200, 1, 7680] - - [539, 5047.8] + - [611, 5047.8] - - [4096, 512, 1, 896] - - [480, 8856.85] + - [552, 8856.85] - - [4096, 1024, 1, 3072] - - [480, 9492.17] + - [552, 9492.17] - - [4096, 200, 1, 13312] - - [472, 6900.73] + - [544, 6900.73] - - [2048, 512, 1, 7168] - - [573, 8788.1] + - [645, 8788.1] - - [2048, 1024, 1, 2816] - - [485, 9229.88] + - [557, 9229.88] - - [2048, 512, 1, 128] - - [503, 5630.04] + - [575, 5630.04] - - [1024, 256, 1, 8192] - - [595, 6203.83] + - [667, 6203.83] - - [4096, 1024, 1, 1792] - - [480, 9510.42] + - [552, 9510.42] - - [1024, 200, 1, 6656] - - [564, 5002.85] + - [636, 5002.85] - - [1024, 1024, 1, 1024] - - [501, 8095.26] + - [573, 8095.26] - - [4096, 200, 1, 2304] - - [585, 6754.45] + - [657, 6754.45] - - [4096, 512, 1, 1152] - - [480, 8974.54] + - [552, 8974.54] - - [512, 200, 1, 1024] - - [562, 2233.01] + - [634, 2233.01] - - [1024, 256, 1, 3840] - - [583, 6244.72] + - [655, 6244.72] - - [512, 512, 1, 768] - - [572, 5331.84] + - [644, 5331.84] - - [2048, 512, 1, 4096] - - [582, 8621.76] + - [654, 8621.76] - - [2048, 256, 1, 2560] - - [572, 7770.93] + - [644, 7770.93] - - [2048, 256, 1, 4160] - - [583, 7923.08] + - [655, 7923.08] - - [1024, 256, 1, 64] - - [479, 1705.1] + - [551, 1705.1] - - [4096, 512, 1, 7680] - - [480, 9364.57] + - [552, 9364.57] - - [1024, 512, 1, 1664] - - [583, 7594.24] + - [655, 7594.24] - - [2048, 512, 1, 2080] - - [574, 8570.67] + - [646, 8570.67] - - [2048, 512, 1, 3840] - - [585, 8729.14] + - [657, 8729.14] - - [4096, 1024, 1, 384] - - [480, 8764.86] + - [552, 8764.86] - - [4096, 200, 1, 3072] - - [574, 6772.39] + - [646, 6772.39] - - [1024, 512, 1, 14336] - - [542, 7680.97] + - [614, 7680.97] - - [1024, 200, 1, 1920] - - [564, 4637.08] + - [636, 4637.08] - - [1024, 1024, 1, 1664] - - [578, 8506.49] + - [650, 8506.49] - - [512, 1024, 1, 2304] - - [572, 7775.33] + - [644, 7775.33] - - [2048, 1024, 1, 1792] - - [480, 9123.46] + - [552, 9123.46] - - [32, 200, 1, 512] - - [580, 125.744] + - [652, 125.744] - - [4096, 256, 1, 11264] - - [585, 8822.31] + - [657, 8822.31] - - [4096, 256, 1, 1408] - - [585, 8419.32] + - [657, 8419.32] - - [1024, 256, 1, 7168] - - [572, 6377.54] + - [644, 6377.54] - - [2048, 256, 1, 1152] - - [583, 7401.81] + - [655, 7401.81] - - [256, 256, 1, 512] - - [562, 1314.93] + - [634, 1314.93] - - [1024, 512, 1, 1280] - - [572, 7410.53] + - [644, 7410.53] - - [512, 512, 1, 1792] - - [564, 5931.44] + - [636, 5931.44] - - [2048, 200, 1, 12288] - - [546, 6242.25] + - [618, 6242.25] - - [2048, 200, 1, 1664] - - [583, 5953.75] + - [655, 5953.75] - - [4096, 200, 1, 4608] - - [578, 6853.54] + - [650, 6853.54] - - [512, 1024, 1, 2560] - - [572, 7778.13] + - [644, 7778.13] - - [4096, 200, 1, 384] - - [565, 5765.73] + - [637, 5765.73] - - [128, 512, 1, 512] - - [562, 1302.68] + - [634, 1302.68] - - [1024, 200, 1, 256] - - [566, 2861.93] + - [638, 2861.93] - - [256, 1024, 1, 1024] - - [507, 4522.26] + - [579, 4522.26] - - [2048, 200, 1, 128] - - [572, 3310.0] + - [644, 3310.0] - - [2048, 200, 1, 11264] - - [553, 6168.2] + - [625, 6168.2] - - [1024, 512, 1, 1920] - - [583, 7649.29] + - [655, 7649.29] - - [4096, 256, 1, 1536] - - [578, 8427.33] + - [650, 8427.33] - - [4096, 1024, 1, 3584] - - [480, 9618.0] + - [552, 9618.0] - - [2048, 256, 1, 256] - - [572, 5464.99] + - [644, 5464.99] - - [2048, 1024, 1, 768] - - [480, 8726.87] + - [552, 8726.87] - - [4096, 256, 1, 10240] - - [574, 8790.89] + - [646, 8790.89] - - [2048, 256, 1, 10240] - - [554, 7665.31] + - [626, 7665.31] - - [4096, 200, 1, 14336] - - [588, 6916.18] + - [660, 6916.18] - - [1024, 512, 1, 5120] - - [526, 7420.36] + - [598, 7420.36] - - [1024, 512, 1, 8320] - - [583, 8061.31] + - [655, 8061.31] - - [256, 200, 1, 2048] - - [538, 1916.36] + - [610, 1916.36] - - [1024, 200, 1, 640] - - [566, 3873.39] + - [638, 3873.39] - - [1024, 512, 1, 10240] - - [571, 7526.9] + - [643, 7526.9] - - [1024, 200, 1, 4160] - - [583, 4928.19] + - [655, 4928.19] - - [1024, 200, 1, 5632] - - [564, 4978.66] + - [636, 4978.66] - - [1024, 1024, 1, 2048] - - [519, 7937.28] + - [591, 7937.28] - - [1024, 256, 1, 6656] - - [583, 6373.68] + - [655, 6373.68] - - [2048, 1024, 1, 8320] - - [480, 9333.15] + - [552, 9333.15] - - [1024, 256, 1, 10240] - - [572, 6407.29] + - [644, 6407.29] - - [2048, 256, 1, 2080] - - [572, 7714.58] + - [644, 7714.58] - - [4096, 256, 1, 128] - - [486, 5765.47] + - [558, 5765.47] - - [1024, 256, 1, 768] - - [577, 5210.42] + - [649, 5210.42] - - [2048, 256, 1, 896] - - [583, 7267.46] + - [655, 7267.46] - - [64, 512, 1, 2048] - - [549, 1296.64] + - [621, 1296.64] - - [4096, 512, 1, 2048] - - [482, 9121.25] + - [554, 9121.25] - - [512, 256, 1, 2048] - - [535, 3283.31] + - [607, 3283.31] - - [4096, 256, 1, 16640] - - [567, 8839.88] + - [639, 8839.88] - - [4096, 512, 1, 2560] - - [485, 9222.15] + - [557, 9222.15] - - [1024, 512, 1, 15360] - - [536, 7865.66] + - [608, 7865.66] - - [4096, 1024, 1, 2304] - - [480, 9558.26] + - [552, 9558.26] - - [4096, 200, 1, 1152] - - [585, 6531.93] + - [657, 6531.93] - - [2048, 200, 1, 6144] - - [583, 6277.75] + - [655, 6277.75] - - [1024, 1024, 1, 7680] - - [578, 8799.34] + - [650, 8799.34] - - [2048, 200, 1, 1920] - - [583, 6031.02] + - [655, 6031.02] - - [32, 1024, 1, 2048] - - [557, 1174.98] + - [629, 1174.98] - - [1024, 200, 1, 3584] - - [564, 4880.44] + - [636, 4880.44] - - [4096, 256, 1, 2080] - - [571, 8557.22] + - [643, 8557.22] - - [1024, 1024, 1, 16384] - - [472, 8618.65] + - [544, 8618.65] - - [1024, 256, 1, 1408] - - [583, 5803.54] + - [655, 5803.54] - - [1024, 256, 1, 4096] - - [593, 6037.78] + - [665, 6037.78] - - [2048, 200, 1, 14336] - - [583, 6364.48] + - [655, 6364.48] - - [4096, 512, 1, 5120] - - [482, 9302.05] + - [554, 9302.05] - - [1024, 512, 1, 6144] - - [518, 7469.09] + - [590, 7469.09] - - [1024, 512, 1, 2304] - - [583, 7759.35] + - [655, 7759.35] - - [4096, 200, 1, 4160] - - [567, 6843.22] + - [639, 6843.22] - - [4096, 200, 1, 1536] - - [578, 6628.27] + - [650, 6628.27] - - [4096, 1024, 1, 6144] - - [480, 9593.08] + - [552, 9593.08] - - [1280, 384, 1, 64] - - [611, 3196.98] + - [683, 3196.98] - - [256, 64, 1, 1225] - - [612, 1194.77] + - [684, 1194.77] - - [2048, 320, 1, 64] - - [614, 3449.36] + - [686, 3449.36] - - [256, 48, 1, 1225] - - [605, 913.498] + - [677, 913.498] - - [2048, 192, 1, 64] - - [604, 2516.68] + - [676, 2516.68] - - [1024, 128, 1, 289] - - [618, 2869.78] + - [690, 2869.78] - - [1280, 192, 1, 64] - - [597, 1872.56] + - [669, 1872.56] - - [192, 32, 1, 1225] - - [602, 505.906] + - [674, 505.906] - - [1280, 448, 1, 64] - - [598, 3078.97] + - [670, 3078.97] - - [384, 64, 1, 1225] - - [603, 1511.43] + - [675, 1511.43] - - [2048, 384, 1, 64] - - [616, 3836.35] + - [688, 3836.35] - - [288, 48, 1, 1225] - - [599, 1032.69] + - [671, 1032.69] - - [64, 80, 1, 5329] - - [615, 888.267] + - [687, 888.267] - - [1024, 384, 1, 289] - - [609, 4291.62] + - [681, 4291.62] - - [2048, 448, 1, 64] - - [608, 3783.62] + - [680, 3783.62] - - [1280, 320, 1, 64] - - [614, 2777.05] + - [686, 2777.05] - - [192, 64, 1, 1225] - - [599, 926.997] + - [671, 926.997] - - [384, 192, 1, 1225] - - [610, 2560.1] + - [682, 2560.1] - - [1536, 256, 1, 64] - - [617, 2621.54] + - [689, 2621.54] - - [192, 48, 1, 1225] - - [602, 698.714] + - [674, 698.714] - - [768, 128, 1, 289] - - [619, 2291.22] + - [691, 2291.22] - - [1024, 256, 1, 289] - - [617, 4064.46] + - [689, 4064.46] - - [768, 192, 1, 289] - - [613, 2690.43] + - [685, 2690.43] - - [1536, 384, 1, 64] - - [600, 3145.83] + - [672, 3145.83] - - [288, 64, 1, 1225] - - [602, 1142.77] + - [674, 1142.77] - - [1024, 192, 1, 289] - - [607, 3243.23] + - [679, 3243.23] - - [384, 96, 1, 1225] - - [620, 1844.81] + - [692, 1844.81] - - [160, 64, 1, 5329] - - [606, 1564.58] + - [678, 1564.58] - - [768, 160, 1, 289] - - [601, 2386.68] + - [673, 2386.68] - - [1024, 3392, 1, 4096] - - [646, 8503.02] + - [718, 8503.02] - - [1024, 3301, 1, 4096] - - [648, 8414.1] + - [720, 8414.1] - - [1024, 3443, 1, 4096] - - [635, 8536.59] + - [707, 8536.59] - - [132, 134, 480, 64] - - [673, 4149.27] + - [745, 4149.27] - - [162, 162, 400, 64] - - [661, 5539.73] + - [733, 5539.73] - - [4096, 3548, 1, 1024] - - [627, 9773.01] + - [699, 9773.01] - - [4096, 2977, 1, 1024] - - [628, 9574.43] + - [700, 9574.43] - - [132, 135, 480, 64] - - [673, 4167.51] + - [745, 4167.51] - - [1024, 2985, 1, 4096] - - [631, 9133.99] + - [703, 9133.99] - - [33708, 3681, 1, 1024] - - [628, 10033.8] + - [700, 10033.8] - - [4096, 3443, 1, 1024] - - [628, 9513.78] + - [700, 9513.78] - - [11, 11, 5456, 64] - - [670, 627.346] + - [742, 627.346] - - [1024, 3400, 1, 4096] - - [649, 8420.02] + - [721, 8420.02] - - [4096, 3995, 1, 1024] - - [627, 9693.87] + - [699, 9693.87] - - [4096, 3190, 1, 1024] - - [627, 9474.84] + - [699, 9474.84] - - [4096, 3594, 1, 1024] - - [628, 9315.83] + - [700, 9315.83] - - [159, 162, 400, 64] - - [660, 5429.98] + - [732, 5429.98] - - [1024, 3565, 1, 4096] - - [643, 8532.8] + - [715, 8532.8] - - [4096, 3422, 1, 1024] - - [628, 9459.24] + - [700, 9459.24] - - [1024, 3214, 1, 4096] - - [648, 8064.92] + - [720, 8064.92] - - [33708, 3584, 1, 1024] - - [629, 10129.0] + - [701, 10129.0] - - [33708, 3640, 1, 1024] - - [626, 9919.22] + - [698, 9919.22] - - [4096, 3263, 1, 1024] - - [626, 9699.35] + - [698, 9699.35] - - [4096, 3296, 1, 1024] - - [626, 9780.8] + - [698, 9780.8] - - [1024, 3557, 1, 4096] - - [647, 8526.89] + - [719, 8526.89] - - [4096, 3463, 1, 1024] - - [626, 9578.13] + - [698, 9578.13] - - [4096, 3528, 1, 1024] - - [626, 9739.92] + - [698, 9739.92] - - [14, 14, 4368, 64] - - [658, 991.276] + - [730, 991.276] - - [4096, 3226, 1, 1024] - - [626, 9587.19] + - [698, 9587.19] - - [4096, 3439, 1, 1024] - - [629, 9499.72] + - [701, 9499.72] - - [1024, 3523, 1, 4096] - - [649, 8393.58] + - [721, 8393.58] - - [1024, 3098, 1, 4096] - - [655, 7882.87] + - [727, 7882.87] - - [4096, 3121, 1, 1024] - - [626, 9296.23] + - [698, 9296.23] - - [33708, 3894, 1, 1024] - - [627, 9952.27] + - [699, 9952.27] - - [1024, 3548, 1, 4096] - - [633, 8432.45] + - [705, 8432.45] - - [1024, 3451, 1, 4096] - - [646, 8456.44] + - [718, 8456.44] - - [4096, 3353, 1, 1024] - - [628, 9289.08] + - [700, 9289.08] - - [4096, 3402, 1, 1024] - - [628, 9406.44] + - [700, 9406.44] - - [4096, 3939, 1, 1024] - - [626, 9549.59] + - [698, 9549.59] - - [133, 133, 480, 64] - - [673, 4124.31] + - [745, 4124.31] - - [1024, 3559, 1, 4096] - - [648, 8587.04] + - [720, 8587.04] - - [1024, 2977, 1, 4096] - - [631, 9084.59] + - [703, 9084.59] - - [1024, 3478, 1, 4096] - - [642, 8342.85] + - [714, 8342.85] - - [134, 134, 480, 64] - - [675, 4204.43] + - [747, 4204.43] - - [1024, 3368, 1, 4096] - - [648, 8277.43] + - [720, 8277.43] - - [4096, 4012, 1, 1024] - - [628, 9726.57] + - [700, 9726.57] - - [4096, 3486, 1, 1024] - - [626, 9639.71] + - [698, 9639.71] - - [1024, 3479, 1, 4096] - - [636, 8420.37] + - [708, 8420.37] - - [1024, 3505, 1, 4096] - - [648, 8310.66] + - [720, 8310.66] - - [4096, 3381, 1, 1024] - - [629, 9357.75] + - [701, 9357.75] - - [4096, 3430, 1, 1024] - - [626, 9482.36] + - [698, 9482.36] - - [1024, 3554, 1, 4096] - - [648, 8592.38] + - [720, 8592.38] - - [4096, 3271, 1, 1024] - - [626, 9715.41] + - [698, 9715.41] - - [1024, 3063, 1, 4096] - - [630, 9388.56] + - [702, 9388.56] - - [1024, 3209, 1, 4096] - - [648, 8212.74] + - [720, 8212.74] - - [4096, 3503, 1, 1024] - - [628, 9680.59] + - [700, 9680.59] - - [4096, 3344, 1, 1024] - - [626, 9268.55] + - [698, 9268.55] - - [1024, 3147, 1, 4096] - - [649, 8037.2] + - [721, 8037.2] - - [1024, 3322, 1, 4096] - - [647, 8356.32] + - [719, 8356.32] - - [1024, 3341, 1, 4096] - - [648, 8316.33] + - [720, 8316.33] - - [1024, 3516, 1, 4096] - - [630, 8397.12] + - [702, 8397.12] - - [102, 101, 624, 64] - - [661, 4709.59] + - [733, 4709.59] - - [1024, 3454, 1, 4096] - - [647, 8425.6] + - [719, 8425.6] - - [4096, 3969, 1, 1024] - - [628, 9640.15] + - [700, 9640.15] - - [4096, 3466, 1, 1024] - - [628, 9576.83] + - [700, 9576.83] - - [1024, 3999, 1, 1024] - - [631, 9207.15] + - [703, 9207.15] - - [1024, 4032, 1, 1024] - - [632, 9294.56] + - [704, 9294.56] - - [1024, 3403, 1, 4096] - - [646, 8357.97] + - [718, 8357.97] - - [4096, 3361, 1, 1024] - - [628, 9308.78] + - [700, 9308.78] - - [1024, 3527, 1, 4096] - - [647, 8512.19] + - [719, 8512.19] - - [1024, 3822, 1, 4096] - - [631, 8991.13] + - [703, 8991.13] - - [4096, 3315, 1, 1024] - - [626, 9834.96] + - [698, 9834.96] - - [232, 232, 272, 64] - - [660, 6481.62] + - [732, 6481.62] - - [1024, 3336, 1, 4096] - - [649, 8295.61] + - [721, 8295.61] - - [228, 232, 272, 64] - - [661, 6327.85] + - [733, 6327.85] - - [4096, 3547, 1, 1024] - - [626, 9781.56] + - [698, 9781.56] - - [4096, 3340, 1, 1024] - - [628, 9269.72] + - [700, 9269.72] - - [1024, 3906, 1, 1024] - - [632, 9018.38] + - [704, 9018.38] - - [1024, 3295, 1, 4096] - - [646, 8194.83] + - [718, 8194.83] - - [4096, 3294, 1, 1024] - - [629, 9762.16] + - [701, 9762.16] - - [33708, 3968, 1, 1024] - - [629, 10147.8] + - [701, 10147.8] - - [1024, 3473, 1, 4096] - - [635, 8318.68] + - [707, 8318.68] - - [1024, 3072, 1, 4096] - - [632, 9370.13] + - [704, 9370.13] - - [4096, 3189, 1, 1024] - - [626, 9470.26] + - [698, 9470.26] - - [4096, 3494, 1, 1024] - - [626, 9661.32] + - [698, 9661.32] - - [1024, 3522, 1, 4096] - - [649, 8459.23] + - [721, 8459.23] - - [33708, 3944, 1, 1024] - - [629, 10060.2] + - [701, 10060.2] - - [135, 135, 480, 64] - - [674, 4257.03] + - [746, 4257.03] - - [4096, 3421, 1, 1024] - - [626, 9456.98] + - [698, 9456.98] - - [32, 32, 1984, 64] - - [671, 3436.24] + - [743, 3436.24] - - [4096, 3311, 1, 1024] - - [626, 9810.88] + - [698, 9810.88] - - [1024, 3990, 1, 1024] - - [633, 9197.74] + - [705, 9197.74] - - [1024, 3290, 1, 4096] - - [646, 8229.63] + - [718, 8229.63] - - [4096, 3565, 1, 1024] - - [627, 9824.48] + - [699, 9824.48] - - [1024, 3484, 1, 4096] - - [636, 8575.38] + - [708, 8575.38] - - [4096, 3384, 1, 1024] - - [626, 9366.54] + - [698, 9366.54] - - [1024, 3422, 1, 4096] - - [646, 8484.12] + - [718, 8484.12] - - [4096, 3681, 1, 1024] - - [627, 9520.16] + - [699, 9520.16] - - [1024, 3584, 1, 1024] - - [653, 8583.37] + - [725, 8583.37] - - [4096, 4050, 1, 1024] - - [628, 9807.35] + - [700, 9807.35] - - [1024, 3996, 1, 4096] - - [629, 9181.7] + - [701, 9181.7] - - [4096, 3169, 1, 1024] - - [627, 9411.4] + - [699, 9411.4] - - [4096, 3538, 1, 1024] - - [627, 9765.99] + - [699, 9765.99] - - [1024, 3495, 1, 4096] - - [633, 8295.95] + - [705, 8295.95] - - [4096, 3401, 1, 1024] - - [626, 9402.68] + - [698, 9402.68] - - [1024, 3560, 1, 4096] - - [647, 8513.45] + - [719, 8513.45] - - [133, 135, 480, 64] - - [674, 4199.08] + - [746, 4199.08] - - [1024, 3263, 1, 4096] - - [648, 8172.23] + - [720, 8172.23] - - [1024, 3870, 1, 4096] - - [628, 8996.27] + - [700, 8996.27] - - [4096, 3555, 1, 1024] - - [629, 9811.88] + - [701, 9811.88] - - [4096, 3412, 1, 1024] - - [626, 9432.09] + - [698, 9432.09] - - [101, 101, 624, 64] - - [660, 4667.69] + - [732, 4667.69] - - [1024, 3296, 1, 4096] - - [647, 8350.61] + - [719, 8350.61] - - [1024, 3379, 1, 4096] - - [649, 8432.94] + - [721, 8432.94] - - [4096, 3302, 1, 1024] - - [626, 9796.39] + - [698, 9796.39] - - [1024, 3490, 1, 4096] - - [646, 8538.44] + - [718, 8538.44] - - [1024, 3428, 1, 4096] - - [647, 8531.67] + - [719, 8531.67] - - [1024, 3976, 1, 4096] - - [628, 9327.87] + - [700, 9327.87] - - [4096, 3485, 1, 1024] - - [626, 9628.82] + - [698, 9628.82] - - [4096, 3534, 1, 1024] - - [626, 9755.97] + - [698, 9755.97] - - [1024, 3064, 1, 4096] - - [632, 9196.98] + - [704, 9196.98] - - [4096, 3216, 1, 1024] - - [628, 9563.44] + - [700, 9563.44] - - [1024, 3450, 1, 4096] - - [656, 8519.29] + - [728, 8519.29] - - [1024, 3533, 1, 4096] - - [647, 8495.77] + - [719, 8495.77] - - [1024, 4030, 1, 1024] - - [632, 9304.68] + - [704, 9304.68] - - [1024, 3311, 1, 4096] - - [647, 8278.6] + - [719, 8278.6] - - [1024, 3468, 1, 4096] - - [638, 8564.55] + - [710, 8564.55] - - [23, 23, 2720, 64] - - [662, 2311.55] + - [734, 2311.55] - - [4096, 3359, 1, 1024] - - [628, 9309.15] + - [700, 9309.15] - - [4096, 3392, 1, 1024] - - [628, 9388.19] + - [700, 9388.19] - - [1024, 3925, 1, 1024] - - [630, 9006.72] + - [702, 9006.72] - - [4096, 3233, 1, 1024] - - [626, 9603.64] + - [698, 9603.64] - - [4096, 3956, 1, 1024] - - [627, 9581.94] + - [699, 9581.94] - - [1024, 3463, 1, 4096] - - [648, 8293.97] + - [720, 8293.97] - - [1024, 3126, 1, 4096] - - [647, 7978.13] + - [719, 7978.13] - - [1024, 3363, 1, 4096] - - [640, 8267.47] + - [712, 8267.47] - - [4096, 3465, 1, 1024] - - [626, 9590.74] + - [698, 9590.74] - - [33708, 3996, 1, 1024] - - [627, 9899.99] + - [699, 9899.99] - - [1024, 3231, 1, 4096] - - [648, 8231.68] + - [720, 8231.68] - - [33708, 3978, 1, 1024] - - [627, 9853.64] + - [699, 9853.64] - - [4096, 3476, 1, 1024] - - [626, 9616.62] + - [698, 9616.62] - - [85, 85, 752, 64] - - [658, 4240.65] + - [730, 4240.65] - - [4096, 3339, 1, 1024] - - [628, 9249.81] + - [700, 9249.81] - - [4096, 3452, 1, 1024] - - [626, 9534.13] + - [698, 9534.13] - - [1024, 3396, 1, 4096] - - [647, 8451.23] + - [719, 8451.23] - - [4096, 3293, 1, 1024] - - [628, 9775.22] + - [700, 9775.22] - - [54, 54, 1184, 64] - - [660, 4153.54] + - [732, 4153.54] - - [1024, 3432, 1, 4096] - - [641, 8345.53] + - [713, 8345.53] - - [4096, 3493, 1, 1024] - - [629, 9649.9] + - [701, 9649.9] - - [4096, 3350, 1, 1024] - - [628, 9273.91] + - [700, 9273.91] - - [1024, 3079, 1, 4096] - - [656, 7775.66] + - [728, 7775.66] - - [1024, 3101, 1, 4096] - - [656, 7847.85] + - [728, 7847.85] - - [33708, 3939, 1, 1024] - - [629, 10054.4] + - [701, 10054.4] - - [4096, 3256, 1, 1024] - - [628, 9681.83] + - [700, 9681.83] - - [1024, 3439, 1, 4096] - - [647, 8531.11] + - [719, 8531.11] - - [1024, 3510, 1, 4096] - - [646, 8422.31] + - [718, 8422.31] - - [4096, 3900, 1, 1024] - - [627, 9468.61] + - [699, 9468.61] - - [1024, 3470, 1, 4096] - - [648, 8507.77] + - [720, 8507.77] - - [4096, 3456, 1, 1024] - - [628, 9577.46] + - [700, 9577.46] - - [4096, 3014, 1, 1024] - - [627, 9666.15] + - [699, 9666.15] - - [4096, 3367, 1, 1024] - - [629, 9328.36] + - [701, 9328.36] - - [4096, 3432, 1, 1024] - - [626, 9480.88] + - [698, 9480.88] - - [33708, 4026, 1, 1024] - - [629, 9972.83] + - [701, 9972.83] - - [4096, 3273, 1, 1024] - - [626, 9716.95] + - [698, 9716.95] - - [4096, 3130, 1, 1024] - - [626, 9311.4] + - [698, 9311.4] - - [1024, 3496, 1, 4096] - - [637, 8434.65] + - [709, 8434.65] - - [1024, 3995, 1, 4096] - - [622, 9157.73] + - [694, 9157.73] - - [1024, 3939, 1, 4096] - - [630, 9059.86] + - [702, 9059.86] - - [1024, 3121, 1, 4096] - - [654, 7963.43] + - [726, 7963.43] - - [1024, 3232, 1, 4096] - - [648, 8061.09] + - [720, 8061.09] - - [4096, 3147, 1, 1024] - - [628, 9364.63] + - [700, 9364.63] - - [4096, 3516, 1, 1024] - - [626, 9708.84] + - [698, 9708.84] - - [1024, 3969, 1, 1024] - - [632, 9168.68] + - [704, 9168.68] - - [1024, 3364, 1, 4096] - - [636, 8363.65] + - [708, 8363.65] - - [4096, 3411, 1, 1024] - - [629, 9442.77] + - [701, 9442.77] - - [147, 147, 432, 64] - - [673, 4843.21] + - [745, 4843.21] - - [4096, 3301, 1, 1024] - - [628, 9783.46] + - [700, 9783.46] - - [112, 111, 576, 64] - - [660, 5627.47] + - [732, 5627.47] - - [1024, 3513, 1, 4096] - - [647, 8725.41] + - [719, 8725.41] - - [1024, 3469, 1, 4096] - - [627, 8183.11] + - [699, 8183.11] - - [1024, 3095, 1, 4096] - - [648, 7887.87] + - [720, 7887.87] - - [4096, 3533, 1, 1024] - - [627, 9755.27] + - [699, 9755.27] - - [4096, 3390, 1, 1024] - - [626, 9377.21] + - [698, 9377.21] - - [4096, 3582, 1, 1024] - - [626, 9874.96] + - [698, 9874.96] - - [1024, 3956, 1, 1024] - - [632, 9058.82] + - [704, 9058.82] - - [4096, 3585, 1, 1024] - - [628, 9289.75] + - [700, 9289.75] - - [4096, 3231, 1, 1024] - - [627, 9597.15] + - [699, 9597.15] - - [1024, 3205, 1, 4096] - - [646, 8073.25] + - [718, 8073.25] - - [4096, 3496, 1, 1024] - - [627, 9668.38] + - [699, 9668.38] - - [1024, 3143, 1, 4096] - - [646, 8031.68] + - [718, 8031.68] - - [1024, 3318, 1, 4096] - - [643, 8261.43] + - [715, 8261.43] - - [1024, 3353, 1, 4096] - - [647, 8414.92] + - [719, 8414.92] - - [1024, 3464, 1, 4096] - - [646, 8310.03] + - [718, 8310.03] - - [4096, 2736, 1, 1024] - - [628, 9563.12] + - [700, 9563.12] - - [1024, 3402, 1, 4096] - - [643, 8413.84] + - [715, 8413.84] - - [4096, 3138, 1, 1024] - - [628, 9342.09] + - [700, 9342.09] - - [1024, 3860, 1, 4096] - - [631, 9008.57] + - [703, 9008.57] - - [148, 148, 432, 64] - - [673, 4915.7] + - [745, 4915.7] - - [1024, 3539, 1, 4096] - - [643, 8449.36] + - [715, 8449.36] - - [4096, 3211, 1, 1024] - - [628, 9551.28] + - [700, 9551.28] - - [1024, 3332, 1, 4096] - - [636, 8295.11] + - [708, 8295.11] - - [1024, 3466, 1, 4096] - - [647, 8339.25] + - [719, 8339.25] - - [4096, 3475, 1, 1024] - - [626, 9612.33] + - [698, 9612.33] - - [4096, 3524, 1, 1024] - - [629, 9722.74] + - [701, 9722.74] - - [4096, 2985, 1, 1024] - - [629, 9591.33] + - [701, 9591.33] - - [4096, 3222, 1, 1024] - - [626, 9577.48] + - [698, 9577.48] - - [4096, 3451, 1, 1024] - - [628, 9541.42] + - [700, 9541.42] - - [1024, 3181, 1, 4096] - - [646, 8118.89] + - [718, 8118.89] - - [1024, 3640, 1, 4096] - - [631, 8617.11] + - [703, 8617.11] - - [1024, 3375, 1, 4096] - - [635, 8419.75] + - [707, 8419.75] - - [1024, 3550, 1, 4096] - - [648, 8512.83] + - [720, 8512.83] - - [1024, 4020, 1, 1024] - - [632, 9266.9] + - [704, 9266.9] - - [1024, 3840, 1, 4096] - - [631, 8983.49] + - [703, 8983.49] - - [4096, 3349, 1, 1024] - - [626, 9279.96] + - [698, 9279.96] - - [4096, 3398, 1, 1024] - - [627, 9402.32] + - [699, 9402.32] - - [33708, 3976, 1, 1024] - - [628, 9849.54] + - [700, 9849.54] - - [1024, 2917, 1, 4096] - - [633, 8936.87] + - [705, 8936.87] - - [33708, 3910, 1, 1024] - - [626, 9983.35] + - [698, 9983.35] - - [4096, 3860, 1, 1024] - - [627, 9377.58] + - [699, 9377.58] - - [4096, 3304, 1, 1024] - - [629, 9798.44] + - [701, 9798.44] - - [1024, 3286, 1, 4096] - - [634, 8167.41] + - [706, 8167.41] - - [1024, 3460, 1, 4096] - - [644, 8539.56] + - [716, 8539.56] - - [1024, 4026, 1, 4096] - - [630, 9305.68] + - [702, 9305.68] - - [4096, 3471, 1, 1024] - - [628, 9596.71] + - [700, 9596.71] - - [193, 193, 320, 64] - - [676, 4758.46] + - [748, 4758.46] - - [1024, 3894, 1, 1024] - - [630, 8979.6] + - [702, 8979.6] - - [65, 65, 992, 64] - - [672, 2565.49] + - [744, 2565.49] - - [1024, 3506, 1, 4096] - - [644, 8593.22] + - [716, 8593.22] - - [35, 35, 1808, 64] - - [666, 2129.72] + - [738, 2129.72] - - [1024, 4000, 1, 1024] - - [630, 9204.6] + - [702, 9204.6] - - [1024, 3900, 1, 4096] - - [626, 9050.36] + - [698, 9050.36] - - [1024, 3445, 1, 4096] - - [649, 8551.65] + - [721, 8551.65] - - [4096, 3442, 1, 1024] - - [627, 9505.0] + - [699, 9505.0] - - [1024, 3358, 1, 4096] - - [648, 8437.16] + - [720, 8437.16] - - [13, 13, 4672, 64] - - [659, 860.665] + - [731, 860.665] - - [1024, 3211, 1, 4096] - - [652, 8085.25] + - [724, 8085.25] - - [4096, 3515, 1, 1024] - - [628, 9715.29] + - [700, 9715.29] - - [1024, 3564, 1, 4096] - - [634, 8760.37] + - [706, 8760.37] - - [4096, 3057, 1, 1024] - - [628, 9804.05] + - [700, 9804.05] - - [1024, 3343, 1, 4096] - - [646, 8363.8] + - [718, 8363.8] - - [4096, 3262, 1, 1024] - - [627, 9686.49] + - [699, 9686.49] - - [1024, 3518, 1, 4096] - - [646, 8455.05] + - [718, 8455.05] - - [77, 77, 816, 64] - - [665, 3505.94] + - [737, 3505.94] - - [33708, 3876, 1, 1024] - - [627, 9895.95] + - [699, 9895.95] - - [4096, 3462, 1, 1024] - - [628, 9570.31] + - [700, 9570.31] - - [1024, 3265, 1, 4096] - - [646, 8322.75] + - [718, 8322.75] - - [4096, 3389, 1, 1024] - - [627, 9382.86] + - [699, 9382.86] - - [4096, 3438, 1, 1024] - - [628, 9503.47] + - [700, 9503.47] - - [1024, 3955, 1, 1024] - - [630, 9064.45] + - [702, 9064.45] - - [1024, 3545, 1, 4096] - - [649, 8652.41] + - [721, 8652.41] - - [1024, 3144, 1, 4096] - - [649, 8060.55] + - [721, 8060.55] - - [1024, 3417, 1, 4096] - - [647, 8505.91] + - [719, 8505.91] - - [4096, 3543, 1, 1024] - - [626, 9775.67] + - [698, 9775.67] - - [4096, 3352, 1, 1024] - - [628, 9282.87] + - [700, 9282.87] - - [33708, 3975, 1, 1024] - - [629, 9849.49] + - [701, 9849.49] - - [148, 147, 432, 64] - - [673, 4876.15] + - [745, 4876.15] - - [4096, 3137, 1, 1024] - - [626, 9330.63] + - [698, 9330.63] - - [4096, 3506, 1, 1024] - - [629, 9682.76] + - [701, 9682.76] - - [1024, 3975, 1, 1024] - - [632, 9164.77] + - [704, 9164.77] - - [1024, 3859, 1, 4096] - - [630, 8983.84] + - [702, 8983.84] - - [4096, 3369, 1, 1024] - - [628, 9330.45] + - [700, 9330.45] - - [1024, 3434, 1, 4096] - - [646, 8486.98] + - [718, 8486.98] - - [1024, 3292, 1, 4096] - - [646, 8478.96] + - [718, 8478.96] - - [4096, 3523, 1, 1024] - - [626, 9734.83] + - [698, 9734.83] - - [4096, 3380, 1, 1024] - - [628, 9354.49] + - [700, 9354.49] - - [1024, 3408, 1, 4096] - - [649, 8441.03] + - [721, 8441.03] - - [4096, 3221, 1, 1024] - - [628, 9575.59] + - [700, 9575.59] - - [4096, 3270, 1, 1024] - - [628, 9717.95] + - [700, 9717.95] - - [143, 143, 432, 64] - - [674, 4643.45] + - [746, 4643.45] - - [111, 111, 576, 64] - - [666, 5475.04] + - [738, 5475.04] - - [1024, 3303, 1, 4096] - - [648, 8413.07] + - [720, 8413.07] - - [4096, 3502, 1, 1024] - - [628, 9679.87] + - [700, 9679.87] - - [1024, 3222, 1, 4096] - - [648, 8141.88] + - [720, 8141.88] - - [4096, 2505, 1, 1024] - - [626, 9594.95] + - [698, 9594.95] - - [4096, 3397, 1, 1024] - - [626, 9392.61] + - [698, 9392.61] - - [4096, 3562, 1, 1024] - - [626, 9827.58] + - [698, 9827.58] - - [4096, 3095, 1, 1024] - - [628, 9222.45] + - [700, 9222.45] - - [1024, 3226, 1, 4096] - - [644, 8027.03] + - [716, 8027.03] - - [177, 177, 352, 64] - - [661, 6406.96] + - [733, 6406.96] - - [4096, 3360, 1, 1024] - - [627, 9298.15] + - [699, 9298.15] - - [1024, 3942, 1, 1024] - - [632, 9061.59] + - [704, 9061.59] - - [1024, 3298, 1, 4096] - - [649, 8254.36] + - [721, 8254.36] - - [1024, 3381, 1, 4096] - - [648, 8508.81] + - [720, 8508.81] - - [4096, 3314, 1, 1024] - - [628, 9837.56] + - [700, 9837.56] - - [1024, 3492, 1, 4096] - - [636, 8583.39] + - [708, 8583.39] - - [1024, 3430, 1, 4096] - - [636, 8492.71] + - [708, 8492.71] - - [4096, 3977, 1, 1024] - - [628, 9656.45] + - [700, 9656.45] - - [4096, 3546, 1, 1024] - - [626, 9780.35] + - [698, 9780.35] - - [4096, 3640, 1, 1024] - - [626, 9415.51] + - [698, 9415.51] - - [4096, 3441, 1, 1024] - - [627, 9499.24] + - [699, 9499.24] - - [33708, 4059, 1, 1024] - - [629, 10051.9] + - [701, 10051.9] - - [1024, 3978, 1, 1024] - - [630, 9158.8] + - [702, 9158.8] - - [1024, 3376, 1, 4096] - - [648, 8415.44] + - [720, 8415.44] - - [1024, 3482, 1, 4096] - - [649, 8396.62] + - [721, 8396.62] - - [1024, 3563, 1, 4096] - - [632, 8424.18] + - [704, 8424.18] - - [4096, 4020, 1, 1024] - - [629, 9745.96] + - [701, 9745.96] - - [1024, 3271, 1, 4096] - - [647, 8289.68] + - [719, 8289.68] - - [1024, 3291, 1, 4096] - - [647, 8222.71] + - [719, 8222.71] - - [1024, 3431, 1, 4096] - - [642, 8464.4] + - [714, 8464.4] - - [1024, 3481, 1, 4096] - - [648, 8386.5] + - [720, 8386.5] - - [84, 85, 752, 64] - - [663, 4194.85] + - [735, 4194.85] - - [4096, 3461, 1, 1024] - - [626, 9579.67] + - [698, 9579.67] - - [1024, 3574, 1, 4096] - - [649, 8579.8] + - [721, 8579.8] - - [1024, 4059, 1, 1024] - - [630, 9330.54] + - [702, 9330.54] - - [84, 84, 752, 64] - - [670, 4141.46] + - [742, 4141.46] - - [1024, 3421, 1, 4096] - - [649, 8528.42] + - [721, 8528.42] - - [4096, 3224, 1, 1024] - - [628, 9589.95] + - [700, 9589.95] - - [4096, 3437, 1, 1024] - - [628, 9498.2] + - [700, 9498.2] - - [45, 45, 1424, 64] - - [660, 3314.58] + - [732, 3314.58] - - [4096, 3840, 1, 1024] - - [626, 9931.37] + - [698, 9931.37] - - [4096, 3168, 1, 1024] - - [628, 9412.16] + - [700, 9412.16] - - [33708, 3990, 1, 1024] - - [626, 9884.39] + - [698, 9884.39] - - [1024, 3349, 1, 4096] - - [648, 8421.4] + - [720, 8421.4] - - [4096, 3335, 1, 1024] - - [626, 9241.65] + - [698, 9241.65] - - [4096, 3400, 1, 1024] - - [628, 9407.35] + - [700, 9407.35] - - [160, 159, 400, 64] - - [675, 5708.94] + - [747, 5708.94] - - [1024, 3398, 1, 4096] - - [648, 8624.03] + - [720, 8624.03] - - [1024, 3780, 1, 4096] - - [628, 8756.78] + - [700, 8756.78] - - [29, 29, 2176, 64] - - [671, 2963.69] + - [743, 2963.69] - - [4096, 3098, 1, 1024] - - [626, 9229.82] + - [698, 9229.82] - - [1024, 4012, 1, 4096] - - [632, 9422.03] + - [704, 9422.03] - - [4096, 3505, 1, 1024] - - [628, 9687.65] + - [700, 9687.65] - - [4096, 3554, 1, 1024] - - [628, 9812.22] + - [700, 9812.22] - - [4096, 3063, 1, 1024] - - [628, 9825.1] + - [700, 9825.1] - - [1024, 3503, 1, 4096] - - [646, 8404.74] + - [718, 8404.74] - - [1024, 3166, 1, 4096] - - [649, 8084.93] + - [721, 8084.93] - - [1024, 3425, 1, 4096] - - [649, 8537.58] + - [721, 8537.58] - - [1024, 3344, 1, 4096] - - [640, 8351.16] + - [712, 8351.16] - - [4096, 3484, 1, 1024] - - [628, 9635.7] + - [700, 9635.7] - - [1024, 3681, 1, 1024] - - [631, 8457.18] + - [703, 8457.18] - - [1024, 4050, 1, 1024] - - [632, 9326.21] + - [704, 9326.21] - - [4096, 3379, 1, 1024] - - [626, 9356.16] + - [698, 9356.16] - - [4096, 3428, 1, 1024] - - [627, 9472.33] + - [699, 9472.33] - - [12, 12, 5040, 64] - - [665, 741.617] + - [737, 741.617] - - [27, 27, 2336, 64] - - [671, 2757.9] + - [743, 2757.9] - - [1024, 3304, 1, 4096] - - [649, 8317.82] + - [721, 8317.82] - - [1024, 3387, 1, 4096] - - [647, 8460.15] + - [719, 8460.15] - - [4096, 3126, 1, 1024] - - [629, 9308.48] + - [701, 9308.48] - - [1024, 3498, 1, 4096] - - [646, 8485.55] + - [718, 8485.55] - - [1024, 3436, 1, 4096] - - [648, 8397.71] + - [720, 8397.71] - - [4096, 3501, 1, 1024] - - [626, 9681.19] + - [698, 9681.19] - - [4096, 3358, 1, 1024] - - [628, 9304.9] + - [700, 9304.9] - - [4096, 3232, 1, 1024] - - [626, 9607.2] + - [698, 9607.2] - - [1024, 3585, 1, 4096] - - [630, 8510.74] + - [702, 8510.74] - - [4096, 3143, 1, 1024] - - [629, 9355.91] + - [701, 9355.91] - - [4096, 3464, 1, 1024] - - [628, 9585.95] + - [700, 9585.95] - - [1024, 3366, 1, 4096] - - [636, 8275.23] + - [708, 8275.23] - - [4096, 3375, 1, 1024] - - [626, 9342.13] + - [698, 9342.13] - - [4096, 2917, 1, 1024] - - [626, 9372.84] + - [698, 9372.84] - - [4096, 4026, 1, 1024] - - [628, 9759.15] + - [700, 9759.15] - - [49, 49, 1296, 64] - - [667, 3710.02] + - [739, 3710.02] - - [1024, 3277, 1, 4096] - - [647, 8217.1] + - [719, 8217.1] - - [1024, 3103, 1, 4096] - - [648, 7872.67] + - [720, 7872.67] - - [33708, 3995, 1, 1024] - - [628, 9893.08] + - [700, 9893.08] - - [1024, 3297, 1, 4096] - - [647, 8185.82] + - [719, 8185.82] - - [4096, 3545, 1, 1024] - - [628, 9789.43] + - [700, 9789.43] - - [1024, 3399, 1, 4096] - - [647, 8377.18] + - [719, 8377.18] - - [33708, 3796, 1, 1024] - - [627, 10008.0] + - [699, 10008.0] - - [4096, 3292, 1, 1024] - - [628, 9767.28] + - [700, 9767.28] - - [71, 71, 896, 64] - - [662, 3006.25] + - [734, 3006.25] - - [33708, 3859, 1, 1024] - - [629, 9860.37] + - [701, 9860.37] - - [4096, 3566, 1, 1024] - - [628, 9834.47] + - [700, 9834.47] - - [4096, 3894, 1, 1024] - - [626, 9456.67] + - [698, 9456.67] - - [4096, 3492, 1, 1024] - - [626, 9653.24] + - [698, 9653.24] - - [1024, 3977, 1, 1024] - - [632, 9161.33] + - [704, 9161.33] - - [1024, 3272, 1, 4096] - - [649, 8257.09] + - [721, 8257.09] - - [135, 134, 480, 64] - - [673, 4238.39] + - [745, 4238.39] - - [1024, 3355, 1, 4096] - - [647, 8374.64] + - [719, 8374.64] - - [4096, 3419, 1, 1024] - - [629, 9455.44] + - [701, 9455.44] - - [1024, 3404, 1, 4096] - - [648, 8580.28] + - [720, 8580.28] - - [4096, 3999, 1, 1024] - - [628, 9701.78] + - [700, 9701.78] - - [4096, 3166, 1, 1024] - - [626, 9410.48] + - [698, 9410.48] - - [33708, 3840, 1, 1024] - - [629, 10132.9] + - [701, 10132.9] - - [4096, 4032, 1, 1024] - - [629, 9762.86] + - [701, 9762.86] - - [1024, 3573, 1, 4096] - - [647, 8603.4] + - [719, 8603.4] - - [4096, 3366, 1, 1024] - - [629, 9322.63] + - [701, 9322.63] - - [1024, 3541, 1, 4096] - - [649, 8405.9] + - [721, 8405.9] - - [4096, 3207, 1, 1024] - - [626, 9544.25] + - [698, 9544.25] - - [4096, 3272, 1, 1024] - - [628, 9716.73] + - [700, 9716.73] - - [1024, 3334, 1, 4096] - - [646, 8241.39] + - [718, 8241.39] - - [228, 228, 272, 64] - - [661, 6232.45] + - [733, 6232.45] - - [4096, 3183, 1, 1024] - - [628, 9452.44] + - [700, 9452.44] - - [4096, 3536, 1, 1024] - - [627, 9759.44] + - [699, 9759.44] - - [1024, 4005, 1, 1024] - - [631, 9225.83] + - [703, 9225.83] - - [1024, 3245, 1, 4096] - - [648, 8074.31] + - [720, 8074.31] - - [4096, 3447, 1, 1024] - - [627, 9525.84] + - [699, 9525.84] - - [1024, 3183, 1, 4096] - - [647, 8121.62] + - [719, 8121.62] - - [1024, 3361, 1, 4096] - - [649, 8285.86] + - [721, 8285.86] - - [33708, 3870, 1, 1024] - - [627, 9879.35] + - [699, 9879.35] - - [1024, 3321, 1, 4096] - - [648, 8408.67] + - [720, 8408.67] - - [1024, 3968, 1, 1024] - - [630, 9202.05] + - [702, 9202.05] - - [1024, 3486, 1, 4096] - - [644, 8258.89] + - [716, 8258.89] - - [4096, 4005, 1, 1024] - - [628, 9723.98] + - [700, 9723.98] - - [4096, 3410, 1, 1024] - - [629, 9440.5] + - [701, 9440.5] - - [1024, 3944, 1, 1024] - - [632, 9040.82] + - [704, 9040.82] - - [4096, 3300, 1, 1024] - - [627, 9789.9] + - [699, 9789.9] - - [4096, 3579, 1, 1024] - - [629, 9859.44] + - [701, 9859.44] - - [4096, 3483, 1, 1024] - - [629, 9624.31] + - [701, 9624.31] - - [4096, 3532, 1, 1024] - - [628, 9742.76] + - [700, 9742.76] - - [1024, 3140, 1, 4096] - - [648, 7899.65] + - [720, 7899.65] - - [1024, 3372, 1, 4096] - - [646, 8237.07] + - [718, 8237.07] - - [1024, 3224, 1, 4096] - - [649, 8159.13] + - [721, 8159.13] - - [4096, 3230, 1, 1024] - - [628, 9601.25] + - [700, 9601.25] - - [4096, 3427, 1, 1024] - - [628, 9466.57] + - [700, 9466.57] - - [1024, 3796, 1, 1024] - - [632, 8739.78] + - [704, 8739.78] - - [143, 148, 432, 64] - - [673, 4762.0] + - [745, 4762.0] - - [1024, 3616, 1, 4096] - - [631, 8445.89] + - [703, 8445.89] - - [1024, 3315, 1, 4096] - - [648, 8403.21] + - [720, 8403.21] - - [1024, 3476, 1, 4096] - - [646, 8523.68] + - [718, 8523.68] - - [1024, 3509, 1, 4096] - - [646, 8345.05] + - [718, 8345.05] - - [4096, 3357, 1, 1024] - - [628, 9300.16] + - [700, 9300.16] - - [4096, 3406, 1, 1024] - - [628, 9427.44] + - [700, 9427.44] - - [1024, 3558, 1, 4096] - - [647, 8525.78] + - [719, 8525.78] - - [4096, 3593, 1, 1024] - - [628, 9302.2] + - [700, 9302.2] - - [4096, 3247, 1, 1024] - - [628, 9648.5] + - [700, 9648.5] - - [4096, 3088, 1, 1024] - - [628, 9204.21] + - [700, 9204.21] - - [1024, 3213, 1, 4096] - - [646, 8054.31] + - [718, 8054.31] - - [4096, 3511, 1, 1024] - - [626, 9702.7] + - [698, 9702.7] - - [122, 122, 528, 64] - - [667, 6293.39] + - [739, 6293.39] - - [1024, 3365, 1, 4096] - - [643, 8413.62] + - [715, 8413.62] - - [1024, 3504, 1, 4096] - - [645, 8414.46] + - [717, 8414.46] - - [1024, 3442, 1, 4096] - - [648, 8684.0] + - [720, 8684.0] - - [4096, 3474, 1, 1024] - - [626, 9611.6] + - [698, 9611.6] - - [4096, 2984, 1, 1024] - - [627, 9592.82] + - [699, 9592.82] - - [1024, 3876, 1, 4096] - - [630, 9085.95] + - [702, 9085.95] - - [4096, 3337, 1, 1024] - - [628, 9246.22] + - [700, 9246.22] - - [4096, 3450, 1, 1024] - - [628, 9534.63] + - [700, 9534.63] - - [1024, 3547, 1, 4096] - - [648, 8386.73] + - [720, 8386.73] - - [4096, 3291, 1, 1024] - - [627, 9759.34] + - [699, 9759.34] - - [1024, 3340, 1, 4096] - - [647, 8237.97] + - [719, 8237.97] - - [4096, 3491, 1, 1024] - - [628, 9656.59] + - [700, 9656.59] - - [4096, 3348, 1, 1024] - - [628, 9279.15] + - [700, 9279.15] - - [78, 78, 816, 64] - - [668, 3591.09] + - [740, 3591.09] - - [4096, 3968, 1, 1024] - - [629, 9642.19] + - [701, 9642.19] - - [4096, 3906, 1, 1024] - - [629, 9485.37] + - [701, 9485.37] - - [1024, 3477, 1, 4096] - - [636, 8389.2] + - [708, 8389.2] - - [1024, 3397, 1, 4096] - - [646, 8556.88] + - [718, 8556.88] - - [4096, 3165, 1, 1024] - - [627, 9415.52] + - [699, 9415.52] - - [4096, 3470, 1, 1024] - - [626, 9598.5] + - [698, 9598.5] - - [1024, 3526, 1, 4096] - - [646, 8442.15] + - [718, 8442.15] - - [112, 112, 576, 64] - - [661, 5672.6] + - [733, 5672.6] - - [4096, 3365, 1, 1024] - - [626, 9321.83] + - [698, 9321.83] - - [4096, 3319, 1, 1024] - - [626, 9838.48] + - [698, 9838.48] - - [1024, 3401, 1, 4096] - - [648, 8460.86] + - [720, 8460.86] - - [1024, 3294, 1, 4096] - - [647, 8324.63] + - [719, 8324.63] - - [159, 159, 400, 64] - - [663, 5488.51] + - [735, 5488.51] - - [1024, 3472, 1, 4096] - - [641, 8289.77] + - [713, 8289.77] - - [4096, 3328, 1, 1024] - - [627, 9904.35] + - [699, 9904.35] - - [1024, 3861, 1, 1024] - - [632, 8917.63] + - [704, 8917.63] - - [1024, 3910, 1, 1024] - - [630, 9010.16] + - [702, 9010.16] - - [1024, 3410, 1, 4096] - - [648, 8519.63] + - [720, 8519.63] - - [1024, 3395, 1, 4096] - - [646, 8424.35] + - [718, 8424.35] - - [4096, 3282, 1, 1024] - - [626, 9743.67] + - [698, 9743.67] - - [1024, 3751, 1, 1024] - - [633, 8680.39] + - [705, 8680.39] - - [4096, 3145, 1, 1024] - - [628, 9353.37] + - [700, 9353.37] - - [4096, 3514, 1, 1024] - - [628, 9713.04] + - [700, 9713.04] - - [4096, 3944, 1, 1024] - - [628, 9563.92] + - [700, 9563.92] - - [1024, 3515, 1, 4096] - - [647, 8428.13] + - [719, 8428.13] - - [4096, 3409, 1, 1024] - - [627, 9428.77] + - [699, 9428.77] - - [4096, 3564, 1, 1024] - - [626, 9823.79] + - [698, 9823.79] - - [4096, 3299, 1, 1024] - - [628, 9793.03] + - [700, 9793.03] - - [1024, 3057, 1, 4096] - - [624, 9237.85] + - [696, 9237.85] - - [4096, 3531, 1, 1024] - - [626, 9745.64] + - [698, 9745.64] - - [4096, 3388, 1, 1024] - - [628, 9374.65] + - [700, 9374.65] - - [1024, 3189, 1, 4096] - - [648, 8084.6] + - [720, 8084.6] - - [1024, 3300, 1, 4096] - - [648, 8185.13] + - [720, 8185.13] - - [1024, 3720, 1, 4096] - - [627, 8755.11] + - [699, 8755.11] - - [1024, 3383, 1, 4096] - - [641, 8463.47] + - [713, 8463.47] - - [1024, 3494, 1, 4096] - - [648, 8676.57] + - [720, 8676.57] - - [77, 78, 816, 64] - - [664, 3548.26] + - [736, 3548.26] - - [1024, 3448, 1, 4096] - - [646, 8665.78] + - [718, 8665.78] - - [4096, 3542, 1, 1024] - - [626, 9771.88] + - [698, 9771.88] - - [1024, 3488, 1, 4096] - - [646, 8488.39] + - [718, 8488.39] - - [4096, 3405, 1, 1024] - - [628, 9426.16] + - [700, 9426.16] - - [1024, 3262, 1, 4096] - - [648, 8206.97] + - [720, 8206.97] - - [33708, 4005, 1, 1024] - - [629, 9928.16] + - [701, 9928.16] - - [1024, 3594, 1, 4096] - - [633, 8458.57] + - [705, 8458.57] - - [4096, 3103, 1, 1024] - - [629, 9243.14] + - [701, 9243.14] - - [4096, 3136, 1, 1024] - - [628, 9340.9] + - [700, 9340.9] - - [1024, 3378, 1, 4096] - - [649, 8432.45] + - [721, 8432.45] - - [10, 10, 5952, 64] - - [669, 523.353] + - [741, 523.353] - - [7, 7, 8192, 64] - - [669, 260.543] + - [741, 260.543] - - [4096, 3559, 1, 1024] - - [628, 9813.1] + - [700, 9813.1] - - [4096, 3368, 1, 1024] - - [629, 9328.66] + - [701, 9328.66] - - [4096, 3209, 1, 1024] - - [626, 9538.83] + - [698, 9538.83] - - [4096, 3322, 1, 1024] - - [628, 9839.58] + - [700, 9839.58] - - [1024, 3483, 1, 4096] - - [634, 8348.35] + - [706, 8348.35] - - [4096, 3473, 1, 1024] - - [627, 9605.79] + - [699, 9605.79] - - [4096, 3522, 1, 1024] - - [629, 9730.02] + - [701, 9730.02] - - [1024, 3532, 1, 4096] - - [647, 8474.32] + - [719, 8474.32] - - [4096, 3449, 1, 1024] - - [628, 9528.35] + - [700, 9528.35] - - [1024, 3351, 1, 4096] - - [649, 8311.23] + - [721, 8311.23] - - [1024, 3462, 1, 4096] - - [646, 8297.64] + - [718, 8297.64] - - [4096, 3396, 1, 1024] - - [628, 9400.25] + - [700, 9400.25] - - [132, 132, 480, 64] - - [674, 4089.84] + - [746, 4089.84] - - [111, 112, 576, 64] - - [660, 5529.7] + - [732, 5529.7] - - [1024, 3416, 1, 4096] - - [647, 8556.64] + - [719, 8556.64] - - [4096, 3469, 1, 1024] - - [629, 9598.77] + - [701, 9598.77] - - [1024, 3582, 1, 4096] - - [630, 8461.47] + - [702, 8461.47] - - [1024, 3230, 1, 4096] - - [647, 8188.94] + - [719, 8188.94] - - [1024, 3489, 1, 4096] - - [648, 8457.85] + - [720, 8457.85] - - [1024, 3427, 1, 4096] - - [648, 8566.59] + - [720, 8566.59] - - [1024, 3346, 1, 4096] - - [647, 8352.17] + - [719, 8352.17] - - [33708, 3977, 1, 1024] - - [629, 9868.5] + - [701, 9868.5] - - [4096, 3796, 1, 1024] - - [628, 9797.76] + - [700, 9797.76] - - [4096, 3176, 1, 1024] - - [628, 9435.39] + - [700, 9435.39] - - [4096, 3990, 1, 1024] - - [626, 9672.33] + - [698, 9672.33] - - [1024, 3257, 1, 4096] - - [649, 8225.17] + - [721, 8225.17] - - [4096, 3343, 1, 1024] - - [650, 9273.62] + - [722, 9273.62] - - [4096, 3440, 1, 1024] - - [626, 9501.48] + - [698, 9501.48] - - [33708, 4030, 1, 1024] - - [627, 9983.36] + - [699, 9983.36] - - [1024, 3190, 1, 4096] - - [648, 8192.11] + - [720, 8192.11] - - [1024, 3389, 1, 4096] - - [649, 8439.42] + - [721, 8439.42] - - [1024, 3500, 1, 4096] - - [647, 8556.12] + - [719, 8556.12] - - [1024, 3471, 1, 4096] - - [636, 8491.17] + - [708, 8491.17] - - [1024, 3438, 1, 4096] - - [649, 8567.95] + - [721, 8567.95] - - [4096, 3513, 1, 1024] - - [626, 9710.27] + - [698, 9710.27] - - [1024, 3562, 1, 4096] - - [641, 8608.94] + - [713, 8608.94] - - [4096, 3616, 1, 1024] - - [628, 9357.59] + - [700, 9357.59] - - [4096, 3955, 1, 1024] - - [627, 9589.71] + - [699, 9589.71] - - [1024, 3441, 1, 4096] - - [637, 8359.27] + - [709, 8359.27] - - [1024, 3236, 1, 4096] - - [651, 8022.6] + - [723, 8022.6] - - [1024, 3524, 1, 4096] - - [646, 8477.24] + - [718, 8477.24] - - [4096, 3460, 1, 1024] - - [626, 9581.96] + - [698, 9581.96] - - [16, 16, 3840, 64] - - [658, 1270.59] + - [730, 1270.59] - - [92, 93, 688, 64] - - [662, 4962.4] + - [734, 4962.4] - - [1024, 3384, 1, 4096] - - [637, 8409.39] + - [709, 8409.39] - - [4096, 3387, 1, 1024] - - [628, 9379.8] + - [700, 9379.8] - - [4096, 3436, 1, 1024] - - [626, 9491.93] + - [698, 9491.93] - - [4096, 3277, 1, 1024] - - [626, 9717.27] + - [698, 9717.27] - - [1024, 3457, 1, 4096] - - [646, 8279.22] + - [718, 8279.22] - - [1024, 3999, 1, 4096] - - [621, 9231.47] + - [693, 9231.47] - - [1024, 4032, 1, 4096] - - [630, 9443.62] + - [702, 9443.62] - - [4096, 3541, 1, 1024] - - [626, 9773.24] + - [698, 9773.24] - - [4096, 3334, 1, 1024] - - [626, 9242.79] + - [698, 9242.79] - - [1024, 3393, 1, 4096] - - [648, 8376.17] + - [720, 8376.17] - - [17, 17, 3632, 64] - - [670, 1425.77] + - [742, 1425.77] - - [1024, 3411, 1, 4096] - - [636, 8490.97] + - [708, 8490.97] - - [1024, 3822, 1, 1024] - - [633, 8773.44] + - [705, 8773.44] - - [1024, 3593, 1, 4096] - - [633, 8571.25] + - [705, 8571.25] - - [33708, 3822, 1, 1024] - - [627, 10056.8] + - [699, 10056.8] - - [4096, 3504, 1, 1024] - - [629, 9680.29] + - [701, 9680.29] - - [1024, 3163, 1, 4096] - - [648, 8014.43] + - [720, 8014.43] - - [1024, 3357, 1, 4096] - - [649, 8376.04] + - [721, 8376.04] - - [1024, 3906, 1, 4096] - - [630, 9108.22] + - [702, 9108.22] - - [4096, 3415, 1, 1024] - - [626, 9443.87] + - [698, 9443.87] - - [1024, 3406, 1, 4096] - - [649, 8451.64] + - [721, 8451.64] - - [4096, 3321, 1, 1024] - - [628, 9836.62] + - [700, 9836.62] - - [4096, 3584, 1, 1024] - - [629, 9915.93] + - [701, 9915.93] - - [1024, 2736, 1, 4096] - - [632, 8532.93] + - [704, 8532.93] - - [1024, 3110, 1, 4096] - - [649, 7889.29] + - [721, 7889.29] - - [33708, 3999, 1, 1024] - - [629, 9903.33] + - [701, 9903.33] - - [1024, 3093, 1, 4096] - - [647, 7919.35] + - [719, 7919.35] - - [4096, 3378, 1, 1024] - - [629, 9362.3] + - [701, 9362.3] - - [1024, 3543, 1, 4096] - - [643, 8438.16] + - [715, 8438.16] - - [33708, 3925, 1, 1024] - - [628, 10021.6] + - [700, 10021.6] - - [1024, 3352, 1, 4096] - - [649, 8333.82] + - [721, 8333.82] - - [4096, 3780, 1, 1024] - - [626, 9755.02] + - [698, 9755.02] - - [1024, 3990, 1, 4096] - - [623, 9251.02] + - [695, 9251.02] - - [4096, 3500, 1, 1024] - - [626, 9673.83] + - [698, 9673.83] - - [4096, 3996, 1, 1024] - - [627, 9694.5] + - [699, 9694.5] - - [1024, 3247, 1, 4096] - - [652, 8171.58] + - [724, 8171.58] - - [4096, 3395, 1, 1024] - - [628, 9392.04] + - [700, 9392.04] - - [1024, 3169, 1, 4096] - - [647, 7990.24] + - [719, 7990.24] - - [1024, 3088, 1, 4096] - - [647, 7890.36] + - [719, 7890.36] - - [1024, 3584, 1, 4096] - - [649, 8604.2] + - [721, 8604.2] - - [4096, 3093, 1, 1024] - - [628, 9224.88] + - [700, 9224.88] - - [1024, 3538, 1, 4096] - - [630, 8395.74] + - [702, 8395.74] - - [1024, 3996, 1, 1024] - - [631, 9208.33] + - [703, 9208.33] - - [1024, 3581, 1, 4096] - - [643, 8523.24] + - [715, 8523.24] - - [4096, 3374, 1, 1024] - - [628, 9342.81] + - [700, 9342.81] - - [33708, 3751, 1, 1024] - - [628, 9881.99] + - [700, 9881.99] - - [59, 59, 1088, 64] - - [666, 4515.54] + - [738, 4515.54] - - [4096, 3215, 1, 1024] - - [628, 9557.75] + - [700, 9557.75] - - [4096, 3312, 1, 1024] - - [626, 9834.4] + - [698, 9834.4] - - [4096, 3581, 1, 1024] - - [628, 9856.66] + - [700, 9856.66] - - [4096, 3479, 1, 1024] - - [628, 9620.35] + - [700, 9620.35] - - [4096, 3544, 1, 1024] - - [626, 9778.94] + - [698, 9778.94] - - [1024, 3870, 1, 1024] - - [631, 8935.26] + - [703, 8935.26] - - [1024, 3374, 1, 4096] - - [648, 8412.85] + - [720, 8412.85] - - [1024, 2967, 1, 4096] - - [631, 8982.97] + - [703, 8982.97] - - [41, 41, 1552, 64] - - [660, 2805.38] + - [732, 2805.38] - - [4096, 3455, 1, 1024] - - [626, 9538.89] + - [698, 9538.89] - - [4096, 3942, 1, 1024] - - [627, 9554.65] + - [699, 9554.65] - - [1024, 3528, 1, 4096] - - [646, 8438.47] + - [718, 8438.47] - - [4096, 3186, 1, 1024] - - [627, 9468.32] + - [699, 9468.32] - - [1024, 3976, 1, 1024] - - [631, 9167.08] + - [703, 9167.08] - - [1024, 3511, 1, 4096] - - [633, 8335.06] + - [705, 8335.06] - - [4096, 3573, 1, 1024] - - [626, 9855.33] + - [698, 9855.33] - - [4096, 3561, 1, 1024] - - [626, 9831.03] + - [698, 9831.03] - - [4096, 3418, 1, 1024] - - [627, 9450.68] + - [699, 9450.68] - - [33708, 3906, 1, 1024] - - [629, 9973.67] + - [701, 9973.67] - - [4096, 3259, 1, 1024] - - [626, 9685.26] + - [698, 9685.26] - - [4096, 3308, 1, 1024] - - [628, 9792.03] + - [700, 9792.03] - - [1024, 3419, 1, 4096] - - [648, 8514.53] + - [720, 8514.53] - - [1024, 3215, 1, 4096] - - [647, 8137.53] + - [719, 8137.53] - - [1024, 4030, 1, 4096] - - [629, 9290.76] + - [701, 9290.76] - - [4096, 3459, 1, 1024] - - [626, 9567.57] + - [698, 9567.57] - - [1024, 3572, 1, 4096] - - [646, 8501.43] + - [718, 8501.43] - - [1024, 3137, 1, 4096] - - [648, 7930.15] + - [720, 7930.15] - - [1024, 3312, 1, 4096] - - [649, 8378.6] + - [721, 8378.6] - - [1024, 3925, 1, 4096] - - [631, 9255.86] + - [703, 9255.86] - - [1024, 3453, 1, 4096] - - [648, 8630.76] + - [720, 8630.76] - - [4096, 3435, 1, 1024] - - [627, 9495.18] + - [699, 9495.18] - - [1024, 3176, 1, 4096] - - [648, 8087.23] + - [720, 8087.23] - - [1024, 3444, 1, 4096] - - [640, 8528.58] + - [712, 8528.58] - - [4096, 3975, 1, 1024] - - [629, 9645.34] + - [701, 9645.34] - - [4096, 3182, 1, 1024] - - [628, 9448.4] + - [700, 9448.4] - - [1024, 3475, 1, 4096] - - [647, 8404.87] + - [719, 8404.87] - - [9, 9, 6544, 64] - - [662, 425.854] + - [734, 425.854] - - [33708, 3955, 1, 1024] - - [629, 10088.4] + - [701, 10088.4] - - [4096, 3446, 1, 1024] - - [628, 9520.06] + - [700, 9520.06] - - [1024, 3138, 1, 4096] - - [647, 8053.44] + - [719, 8053.44] - - [1024, 3549, 1, 4096] - - [633, 8426.42] + - [705, 8426.42] - - [4096, 3287, 1, 1024] - - [629, 9751.34] + - [701, 9751.34] - - [1024, 3342, 1, 4096] - - [646, 8320.01] + - [718, 8320.01] - - [102, 102, 624, 64] - - [661, 4747.52] + - [733, 4747.52] - - [4096, 3519, 1, 1024] - - [628, 9716.1] + - [700, 9716.1] - - [4096, 3552, 1, 1024] - - [626, 9806.69] + - [698, 9806.69] - - [4096, 3859, 1, 1024] - - [626, 9369.94] + - [698, 9369.94] - - [33708, 3969, 1, 1024] - - [626, 9830.39] + - [698, 9830.39] - - [1024, 3369, 1, 4096] - - [647, 8379.26] + - [719, 8379.26] - - [4096, 3482, 1, 1024] - - [626, 9631.7] + - [698, 9631.7] - - [1024, 3306, 1, 4096] - - [649, 8320.06] + - [721, 8320.06] - - [1024, 3474, 1, 4096] - - [648, 8498.9] + - [720, 8498.9] - - [99, 99, 624, 64] - - [660, 4492.9] + - [732, 4492.9] - - [4096, 3377, 1, 1024] - - [626, 9369.92] + - [698, 9369.92] - - [4096, 3426, 1, 1024] - - [626, 9467.3] + - [698, 9467.3] - - [4096, 2935, 1, 1024] - - [627, 9423.74] + - [699, 9423.74] - - [4096, 3267, 1, 1024] - - [626, 9698.04] + - [698, 9698.04] - - [1024, 3299, 1, 4096] - - [647, 8264.76] + - [719, 8264.76] - - [1024, 3456, 1, 4096] - - [646, 8678.39] + - [718, 8678.39] - - [1024, 3280, 1, 4096] - - [647, 8220.69] + - [719, 8220.69] - - [1024, 3555, 1, 4096] - - [646, 8656.27] + - [718, 8656.27] - - [4096, 3499, 1, 1024] - - [628, 9663.93] + - [700, 9663.93] - - [4096, 3356, 1, 1024] - - [628, 9296.9] + - [700, 9296.9] - - [100, 102, 624, 64] - - [661, 4671.51] + - [733, 4671.51] - - [1024, 3412, 1, 4096] - - [649, 8538.05] + - [721, 8538.05] - - [1024, 2984, 1, 4096] - - [632, 9193.17] + - [704, 9193.17] - - [4096, 3141, 1, 1024] - - [628, 9349.43] + - [700, 9349.43] - - [4096, 3510, 1, 1024] - - [626, 9701.98] + - [698, 9701.98] - - [1024, 3995, 1, 1024] - - [630, 9243.4] + - [702, 9243.4] - - [1024, 3517, 1, 4096] - - [648, 8569.31] + - [720, 8569.31] - - [1024, 3455, 1, 4096] - - [648, 8560.67] + - [720, 8560.67] - - [1024, 3939, 1, 1024] - - [631, 9030.94] + - [703, 9030.94] - - [38, 38, 1680, 64] - - [660, 2459.84] + - [732, 2459.84] - - [1024, 3447, 1, 4096] - - [646, 8610.02] + - [718, 8610.02] - - [1024, 3969, 1, 4096] - - [633, 9097.33] + - [705, 9097.33] - - [4096, 3527, 1, 1024] - - [628, 9743.83] + - [700, 9743.83] - - [4096, 3336, 1, 1024] - - [628, 9248.33] + - [700, 9248.33] - - [1024, 3191, 1, 4096] - - [646, 8104.96] + - [718, 8104.96] - - [1024, 3302, 1, 4096] - - [647, 8245.09] + - [719, 8245.09] - - [1024, 3337, 1, 4096] - - [649, 8254.25] + - [721, 8254.25] - - [4096, 3290, 1, 1024] - - [628, 9759.13] + - [700, 9759.13] - - [1024, 3512, 1, 4096] - - [637, 8641.06] + - [709, 8641.06] - - [1024, 3433, 1, 4096] - - [647, 8444.7] + - [719, 8444.7] - - [4096, 3876, 1, 1024] - - [627, 9420.38] + - [699, 9420.38] - - [4096, 3490, 1, 1024] - - [628, 9641.11] + - [700, 9641.11] - - [4096, 3064, 1, 1024] - - [628, 9820.49] + - [700, 9820.49] - - [1024, 3508, 1, 4096] - - [643, 8442.24] + - [715, 8442.24] - - [1024, 3956, 1, 4096] - - [628, 9128.19] + - [700, 9128.19] - - [4096, 3417, 1, 1024] - - [628, 9448.41] + - [700, 9448.41] - - [1024, 3248, 1, 4096] - - [647, 8006.16] + - [719, 8006.16] - - [1024, 2499, 1, 4096] - - [647, 8155.19] + - [719, 8155.19] - - [1024, 3186, 1, 4096] - - [647, 8093.04] + - [719, 8093.04] - - [1024, 3180, 1, 4096] - - [649, 8097.02] + - [721, 8097.02] - - [4096, 3364, 1, 1024] - - [628, 9318.08] + - [700, 9318.08] - - [4096, 3976, 1, 1024] - - [628, 9654.47] + - [700, 9654.47] - - [4096, 3205, 1, 1024] - - [629, 9538.84] + - [701, 9538.84] - - [4096, 3318, 1, 1024] - - [626, 9838.29] + - [698, 9838.29] - - [1024, 3377, 1, 4096] - - [649, 8445.64] + - [721, 8445.64] - - [1024, 3485, 1, 4096] - - [646, 8368.83] + - [718, 8368.83] - - [4096, 3181, 1, 1024] - - [629, 9458.29] + - [701, 9458.29] - - [4096, 3550, 1, 1024] - - [626, 9783.14] + - [698, 9783.14] - - [1024, 3534, 1, 4096] - - [635, 8684.99] + - [707, 8684.99] - - [1024, 3860, 1, 1024] - - [630, 8923.18] + - [702, 8923.18] - - [160, 160, 400, 64] - - [673, 5797.69] + - [745, 5797.69] - - [4096, 3445, 1, 1024] - - [628, 9511.28] + - [700, 9511.28] - - [1024, 3391, 1, 4096] - - [649, 8541.77] + - [721, 8541.77] - - [1024, 3221, 1, 4096] - - [647, 8055.5] + - [719, 8055.5] - - [4096, 3079, 1, 1024] - - [626, 9181.04] + - [698, 9181.04] - - [4096, 3144, 1, 1024] - - [628, 9351.45] + - [700, 9351.45] - - [1024, 3270, 1, 4096] - - [648, 8367.63] + - [720, 8367.63] - - [1024, 3561, 1, 4096] - - [648, 8426.29] + - [720, 8426.29] - - [1024, 3480, 1, 4096] - - [635, 8465.0] + - [707, 8465.0] - - [4096, 3408, 1, 1024] - - [628, 9420.04] + - [700, 9420.04] - - [1024, 3418, 1, 4096] - - [649, 8481.02] + - [721, 8481.02] - - [4096, 3298, 1, 1024] - - [629, 9788.4] + - [701, 9788.4] - - [1024, 3640, 1, 1024] - - [632, 8435.44] + - [704, 8435.44] - - [1024, 3449, 1, 4096] - - [647, 8590.87] + - [719, 8590.87] - - [1024, 4020, 1, 4096] - - [625, 9168.13] + - [697, 9168.13] - - [4096, 3481, 1, 1024] - - [626, 9627.91] + - [698, 9627.91] - - [4096, 3530, 1, 1024] - - [628, 9734.68] + - [700, 9734.68] - - [1024, 3216, 1, 4096] - - [649, 8014.32] + - [721, 8014.32] - - [1024, 3840, 1, 1024] - - [632, 8908.37] + - [704, 8908.37] - - [1024, 3491, 1, 4096] - - [635, 8410.59] + - [707, 8410.59] - - [1024, 3154, 1, 4096] - - [648, 8095.69] + - [720, 8095.69] - - [4096, 3425, 1, 1024] - - [628, 9474.53] + - [700, 9474.53] - - [1024, 3348, 1, 4096] - - [646, 8202.9] + - [718, 8202.9] - - [1024, 3415, 1, 4096] - - [647, 8597.68] + - [719, 8597.68] - - [1024, 4026, 1, 1024] - - [630, 9279.09] + - [702, 9279.09] - - [1024, 3367, 1, 4096] - - [649, 8335.54] + - [721, 8335.54] - - [1024, 3259, 1, 4096] - - [649, 8285.3] + - [721, 8285.3] - - [1024, 3894, 1, 4096] - - [632, 9040.44] + - [704, 9040.44] - - [4096, 3355, 1, 1024] - - [627, 9291.67] + - [699, 9291.67] - - [4096, 3404, 1, 1024] - - [628, 9410.47] + - [700, 9410.47] - - [1024, 3308, 1, 4096] - - [649, 8336.3] + - [721, 8336.3] - - [4096, 3245, 1, 1024] - - [627, 9641.47] + - [699, 9641.47] - - [1024, 3502, 1, 4096] - - [648, 8375.9] + - [720, 8375.9] - - [33708, 4032, 1, 1024] - - [627, 9988.2] + - [699, 9988.2] - - [8, 8, 7280, 64] - - [664, 339.878] + - [736, 339.878] - - [1024, 3424, 1, 4096] - - [635, 8489.48] + - [707, 8489.48] - - [4096, 3509, 1, 1024] - - [627, 9702.29] + - [699, 9702.29] - - [4096, 3558, 1, 1024] - - [628, 9815.51] + - [700, 9815.51] - - [1024, 3900, 1, 1024] - - [631, 9014.05] + - [703, 9014.05] - - [1024, 2505, 1, 4096] - - [645, 8263.75] + - [717, 8263.75] - - [4096, 3472, 1, 1024] - - [626, 9609.61] + - [698, 9609.61] - - [1024, 3386, 1, 4096] - - [646, 8417.55] + - [718, 8417.55] - - [4096, 3383, 1, 1024] - - [628, 9364.77] + - [700, 9364.77] - - [4096, 3448, 1, 1024] - - [629, 9521.07] + - [701, 9521.07] - - [4096, 4030, 1, 1024] - - [629, 9771.56] + - [701, 9771.56] - - [4096, 3289, 1, 1024] - - [626, 9757.27] + - [698, 9757.27] - - [1024, 3459, 1, 4096] - - [648, 8422.12] + - [720, 8422.12] - - [1024, 2918, 1, 4096] - - [633, 9022.71] + - [705, 9022.71] - - [4096, 3489, 1, 1024] - - [626, 9641.9] + - [698, 9641.9] - - [4096, 3346, 1, 1024] - - [628, 9271.65] + - [700, 9271.65] - - [4096, 3572, 1, 1024] - - [628, 9829.82] + - [700, 9829.82] - - [1024, 3955, 1, 4096] - - [629, 9221.66] + - [701, 9221.66] - - [4096, 3236, 1, 1024] - - [626, 9620.72] + - [698, 9620.72] - - [4096, 3163, 1, 1024] - - [626, 9397.3] + - [698, 9397.3] - - [4096, 3468, 1, 1024] - - [626, 9601.58] + - [698, 9601.58] - - [1024, 3165, 1, 4096] - - [648, 7941.58] + - [720, 7941.58] - - [1024, 3276, 1, 4096] - - [648, 8244.96] + - [720, 8244.96] - - [1024, 3359, 1, 4096] - - [646, 8273.93] + - [718, 8273.93] - - [4096, 3363, 1, 1024] - - [628, 9315.8] + - [700, 9315.8] - - [1024, 3385, 1, 4096] - - [640, 8286.2] + - [712, 8286.2] - - [1024, 3207, 1, 4096] - - [649, 8144.02] + - [721, 8144.02] - - [1024, 3458, 1, 4096] - - [648, 8472.41] + - [720, 8472.41] - - [21, 21, 2976, 64] - - [664, 2083.3] + - [736, 2083.3] - - [4096, 3110, 1, 1024] - - [626, 9260.3] + - [698, 9260.3] - - [4096, 3925, 1, 1024] - - [629, 9526.66] + - [701, 9526.66] - - [1024, 3975, 1, 4096] - - [624, 9133.84] + - [696, 9133.84] - - [4096, 3549, 1, 1024] - - [628, 9793.77] + - [700, 9793.77] - - [4096, 3342, 1, 1024] - - [627, 9264.48] + - [699, 9264.48] - - [1024, 3859, 1, 1024] - - [630, 8933.47] + - [702, 8933.47] - - [1024, 3497, 1, 4096] - - [647, 8526.13] + - [719, 8526.13] - - [4096, 3280, 1, 1024] - - [628, 9733.32] + - [700, 9733.32] - - [1024, 3435, 1, 4096] - - [647, 8489.85] + - [719, 8489.85] - - [1024, 3354, 1, 4096] - - [647, 8248.83] + - [719, 8248.83] - - [4096, 3191, 1, 1024] - - [627, 9475.12] + - [699, 9475.12] - - [4096, 3512, 1, 1024] - - [626, 9701.37] + - [698, 9701.37] - - [1024, 3055, 1, 4096] - - [633, 9264.91] + - [705, 9264.91] - - [4096, 2499, 1, 1024] - - [628, 9574.06] + - [700, 9574.06] - - [1024, 3233, 1, 4096] - - [646, 8101.74] + - [718, 8101.74] - - [4096, 3423, 1, 1024] - - [629, 9463.5] + - [701, 9463.5] - - [1024, 3319, 1, 4096] - - [649, 8413.76] + - [721, 8413.76] - - [4096, 3297, 1, 1024] - - [626, 9782.66] + - [698, 9782.66] - - [4096, 3154, 1, 1024] - - [628, 9381.2] + - [700, 9381.2] - - [1024, 3540, 1, 4096] - - [649, 8507.53] + - [721, 8507.53] - - [1024, 3289, 1, 4096] - - [649, 8233.8] + - [721, 8233.8] - - [4096, 3529, 1, 1024] - - [628, 9741.15] + - [700, 9741.15] - - [4096, 3386, 1, 1024] - - [628, 9372.57] + - [700, 9372.57] - - [4096, 3276, 1, 1024] - - [626, 9713.76] + - [698, 9713.76] - - [1024, 3244, 1, 4096] - - [649, 8146.83] + - [721, 8146.83] - - [1024, 3182, 1, 4096] - - [646, 8115.12] + - [718, 8115.12] - - [4096, 3540, 1, 1024] - - [626, 9768.42] + - [698, 9768.42] - - [1024, 3360, 1, 4096] - - [648, 8353.31] + - [720, 8353.31] - - [1024, 3942, 1, 4096] - - [627, 9143.78] + - [699, 9143.78] - - [4096, 3403, 1, 1024] - - [629, 9412.18] + - [701, 9412.18] - - [4096, 3101, 1, 1024] - - [629, 9239.28] + - [701, 9239.28] - - [4096, 2918, 1, 1024] - - [628, 9373.75] + - [700, 9373.75] - - [1024, 3465, 1, 4096] - - [649, 8288.16] + - [721, 8288.16] - - [33708, 3780, 1, 1024] - - [628, 9971.91] + - [700, 9971.91] - - [4096, 3557, 1, 1024] - - [626, 9814.82] + - [698, 9814.82] - - [4096, 3414, 1, 1024] - - [626, 9436.63] + - [698, 9436.63] - - [1024, 3948, 1, 1024] - - [630, 9073.8] + - [702, 9073.8] - - [4096, 3320, 1, 1024] - - [628, 9834.77] + - [700, 9834.77] - - [4096, 2765, 1, 1024] - - [628, 9667.06] + - [700, 9667.06] - - [1024, 3978, 1, 4096] - - [623, 9109.6] + - [695, 9109.6] - - [4096, 3487, 1, 1024] - - [626, 9644.0] + - [698, 9644.0] - - [4096, 3520, 1, 1024] - - [628, 9728.08] + - [700, 9728.08] - - [1024, 3139, 1, 4096] - - [648, 7940.19] + - [720, 7940.19] - - [1024, 3314, 1, 4096] - - [646, 8294.01] + - [718, 8294.01] - - [4096, 3431, 1, 1024] - - [628, 9482.12] + - [700, 9482.12] - - [123, 122, 528, 64] - - [661, 6325.98] + - [733, 6325.98] - - [1024, 3446, 1, 4096] - - [642, 8468.34] + - [714, 8468.34] - - [1024, 4059, 1, 4096] - - [629, 9370.8] + - [701, 9370.8] - - [99, 102, 624, 64] - - [661, 4624.8] + - [733, 4624.8] - - [4096, 3345, 1, 1024] - - [626, 9271.32] + - [698, 9271.32] - - [4096, 3394, 1, 1024] - - [626, 9398.19] + - [698, 9398.19] - - [1024, 3927, 1, 1024] - - [631, 9041.38] + - [703, 9041.38] - - [4096, 3235, 1, 1024] - - [626, 9619.93] + - [698, 9619.93] - - [1024, 3328, 1, 4096] - - [647, 8406.09] + - [719, 8406.09] - - [33708, 3956, 1, 1024] - - [627, 10100.4] + - [699, 10100.4] - - [4096, 3467, 1, 1024] - - [628, 9586.66] + - [700, 9586.66] - - [1024, 3287, 1, 4096] - - [648, 8273.83] + - [720, 8273.83] - - [4096, 3214, 1, 1024] - - [629, 9557.49] + - [701, 9557.49] - - [4096, 3910, 1, 1024] - - [626, 9490.25] + - [698, 9490.25] - - [1024, 3780, 1, 1024] - - [633, 8706.0] + - [705, 8706.0] - - [1024, 3371, 1, 4096] - - [649, 8248.46] + - [721, 8248.46] - - [4096, 3478, 1, 1024] - - [629, 9619.62] + - [701, 9619.62] - - [1024, 3546, 1, 4096] - - [647, 8456.83] + - [719, 8456.83] - - [1024, 4012, 1, 1024] - - [630, 9253.34] + - [702, 9253.34] - - [4096, 3341, 1, 1024] - - [628, 9260.24] + - [700, 9260.24] - - [4096, 3454, 1, 1024] - - [626, 9533.62] + - [698, 9533.62] - - [4096, 3295, 1, 1024] - - [629, 9772.86] + - [701, 9772.86] - - [4096, 3072, 1, 1024] - - [626, 9887.23] + - [698, 9887.23] - - [1024, 3282, 1, 4096] - - [634, 8112.85] + - [706, 8112.85] - - [33708, 3720, 1, 1024] - - [629, 9818.85] + - [701, 9818.85] - - [1024, 3681, 1, 4096] - - [631, 8639.28] + - [703, 8639.28] - - [1024, 4050, 1, 4096] - - [629, 9291.93] + - [701, 9291.93] - - [4096, 3495, 1, 1024] - - [628, 9660.52] + - [700, 9660.52] - - [4096, 3560, 1, 1024] - - [627, 9813.8] + - [699, 9813.8] - - [4096, 3751, 1, 1024] - - [626, 9684.95] + - [698, 9684.95] - - [1024, 3414, 1, 4096] - - [647, 8555.72] + - [719, 8555.72] - - [33708, 3860, 1, 1024] - - [626, 9856.68] + - [698, 9856.68] - - [1024, 3325, 1, 4096] - - [636, 8261.21] + - [708, 8261.21] - - [4096, 3458, 1, 1024] - - [626, 9570.86] + - [698, 9570.86] - - [4096, 2967, 1, 1024] - - [626, 9544.61] + - [698, 9544.61] - - [1024, 3519, 1, 4096] - - [649, 8413.1] + - [721, 8413.1] - - [4096, 3385, 1, 1024] - - [628, 9367.34] + - [700, 9367.34] - - [4096, 3434, 1, 1024] - - [626, 9488.41] + - [698, 9488.41] - - [1024, 3552, 1, 4096] - - [647, 8456.13] + - [719, 8456.13] - - [4096, 3822, 1, 1024] - - [627, 9849.84] + - [699, 9849.84] - - [1024, 3544, 1, 4096] - - [646, 8494.56] + - [718, 8494.56] - - [4096, 3539, 1, 1024] - - [628, 9763.09] + - [700, 9763.09] - - [4096, 3332, 1, 1024] - - [626, 9232.36] + - [698, 9232.36] - - [1024, 3145, 1, 4096] - - [646, 8098.36] + - [718, 8098.36] - - [1024, 3535, 1, 4096] - - [634, 8592.8] + - [706, 8592.8] - - [1024, 3320, 1, 4096] - - [647, 8419.55] + - [719, 8419.55] - - [33708, 4012, 1, 1024] - - [629, 9940.2] + - [701, 9940.2] - - [4096, 3286, 1, 1024] - - [628, 9747.82] + - [700, 9747.82] - - [1024, 3514, 1, 4096] - - [647, 8653.69] + - [719, 8653.69] - - [93, 93, 688, 64] - - [668, 5005.79] + - [740, 5005.79] - - [1024, 2765, 1, 4096] - - [633, 8636.72] + - [705, 8636.72] - - [1024, 3452, 1, 4096] - - [646, 8445.87] + - [718, 8445.87] - - [4096, 3518, 1, 1024] - - [626, 9722.56] + - [698, 9722.56] - - [1024, 3529, 1, 4096] - - [646, 8444.32] + - [718, 8444.32] - - [4096, 3413, 1, 1024] - - [626, 9436.35] + - [698, 9436.35] - - [33708, 4050, 1, 1024] - - [628, 10026.7] + - [700, 10026.7] - - [1024, 3525, 1, 4096] - - [639, 8488.99] + - [711, 8488.99] - - [4096, 3303, 1, 1024] - - [626, 9791.05] + - [698, 9791.05] - - [1024, 3382, 1, 4096] - - [647, 8483.63] + - [719, 8483.63] - - [1024, 3390, 1, 4096] - - [646, 8552.81] + - [718, 8552.81] - - [1024, 3977, 1, 4096] - - [628, 9053.53] + - [700, 9053.53] - - [1024, 3184, 1, 4096] - - [646, 8008.81] + - [718, 8008.81] - - [4096, 3535, 1, 1024] - - [628, 9760.79] + - [700, 9760.79] - - [4096, 3376, 1, 1024] - - [629, 9341.93] + - [701, 9341.93] - - [4096, 3978, 1, 1024] - - [629, 9642.8] + - [701, 9642.8] - - [1024, 3136, 1, 4096] - - [648, 8085.12] + - [720, 8085.12] - - [1024, 3293, 1, 4096] - - [646, 8300.49] + - [718, 8300.49] - - [4096, 3266, 1, 1024] - - [627, 9691.78] + - [699, 9691.78] - - [1024, 3487, 1, 4096] - - [646, 8383.62] + - [718, 8383.62] - - [1024, 3409, 1, 4096] - - [648, 8493.25] + - [720, 8493.25] - - [4096, 3498, 1, 1024] - - [627, 9672.38] + - [699, 9672.38] - - [1024, 3520, 1, 4096] - - [649, 8488.26] + - [721, 8488.26] - - [1024, 3530, 1, 4096] - - [630, 8409.87] + - [702, 8409.87] - - [4096, 3393, 1, 1024] - - [628, 9395.43] + - [700, 9395.43] - - [4096, 3140, 1, 1024] - - [628, 9338.5] + - [700, 9338.5] - - [1024, 3536, 1, 4096] - - [649, 8642.11] + - [721, 8642.11] - - [1024, 3288, 1, 4096] - - [649, 8229.34] + - [721, 8229.34] - - [1024, 4005, 1, 4096] - - [631, 9271.04] + - [703, 9271.04] - - [1024, 3579, 1, 4096] - - [635, 8844.5] + - [707, 8844.5] - - [4096, 3372, 1, 1024] - - [626, 9339.25] + - [698, 9339.25] - - [1024, 3440, 1, 4096] - - [646, 8466.69] + - [718, 8466.69] - - [4096, 3213, 1, 1024] - - [629, 9558.85] + - [701, 9558.85] - - [123, 123, 528, 64] - - [661, 6333.59] + - [733, 6333.59] - - [100, 100, 624, 64] - - [660, 4584.12] + - [732, 4584.12] - - [1024, 3968, 1, 4096] - - [627, 9237.6] + - [699, 9237.6] - - [4096, 3477, 1, 1024] - - [627, 9618.88] + - [699, 9618.88] - - [4096, 3526, 1, 1024] - - [626, 9735.94] + - [698, 9735.94] - - [1024, 3493, 1, 4096] - - [647, 8355.13] + - [719, 8355.13] - - [1024, 3944, 1, 4096] - - [622, 9065.39] + - [694, 9065.39] - - [4096, 3453, 1, 1024] - - [627, 9533.37] + - [699, 9533.37] - - [1024, 3350, 1, 4096] - - [649, 8448.64] + - [721, 8448.64] - - [4096, 3184, 1, 1024] - - [628, 9447.38] + - [700, 9447.38] - - [1024, 3423, 1, 4096] - - [647, 8465.38] + - [719, 8465.38] - - [4096, 3351, 1, 1024] - - [626, 9282.06] + - [698, 9282.06] - - [4096, 3416, 1, 1024] - - [626, 9446.64] + - [698, 9446.64] - - [1024, 3796, 1, 4096] - - [628, 8820.34] + - [700, 8820.34] - - [4096, 3257, 1, 1024] - - [626, 9671.64] + - [698, 9671.64] - - [4096, 3306, 1, 1024] - - [628, 9795.51] + - [700, 9795.51] - - [33708, 4020, 1, 1024] - - [628, 9961.85] + - [700, 9961.85] - - [19, 19, 3264, 64] - - [658, 1736.09] + - [730, 1736.09] - - [1024, 3426, 1, 4096] - - [646, 8518.61] + - [718, 8518.61] - - [4096, 3457, 1, 1024] - - [626, 9564.56] + - [698, 9564.56] - - [1024, 2935, 1, 4096] - - [631, 9067.79] + - [703, 9067.79] - - [1024, 3046, 1, 4096] - - [631, 9242.97] + - [703, 9242.97] - - [4096, 3433, 1, 1024] - - [628, 9495.65] + - [700, 9495.65] - - [1024, 3256, 1, 4096] - - [649, 8224.23] + - [721, 8224.23] - - [1024, 3531, 1, 4096] - - [646, 8524.19] + - [718, 8524.19] - - [4096, 3180, 1, 1024] - - [626, 9443.53] + - [698, 9443.53] - - [1024, 3388, 1, 4096] - - [648, 8352.82] + - [720, 8352.82] - - [4096, 3444, 1, 1024] - - [629, 9511.03] + - [701, 9511.03] - - [1024, 3501, 1, 4096] - - [636, 8461.12] + - [708, 8461.12] - - [1024, 3266, 1, 4096] - - [634, 8147.44] + - [706, 8147.44] - - [1024, 3267, 1, 4096] - - [649, 8391.49] + - [721, 8391.49] - - [1024, 3461, 1, 4096] - - [633, 8270.29] + - [705, 8270.29] - - [4096, 3870, 1, 1024] - - [628, 9399.69] + - [700, 9399.69] - - [4096, 3517, 1, 1024] - - [626, 9725.43] + - [698, 9725.43] - - [1024, 3566, 1, 4096] - - [649, 8669.76] + - [721, 8669.76] - - [4096, 3574, 1, 1024] - - [626, 9844.63] + - [698, 9844.63] - - [1024, 3876, 1, 1024] - - [631, 8961.74] + - [703, 8961.74] - - [25, 25, 2512, 64] - - [657, 2472.54] + - [729, 2472.54] - - [4096, 3720, 1, 1024] - - [626, 9612.49] + - [698, 9612.49] - - [4096, 3248, 1, 1024] - - [628, 9644.92] + - [700, 9644.92] - - [4096, 4059, 1, 1024] - - [626, 9826.42] + - [698, 9826.42] - - [1024, 3380, 1, 4096] - - [647, 8677.91] + - [719, 8677.91] - - [4096, 3480, 1, 1024] - - [628, 9626.16] + - [700, 9626.16] - - [1024, 3335, 1, 4096] - - [648, 8302.18] + - [720, 8302.18] - - [1024, 3345, 1, 4096] - - [648, 8323.13] + - [720, 8323.13] - - [4096, 3391, 1, 1024] - - [626, 9379.48] + - [698, 9379.48] - - [4096, 3424, 1, 1024] - - [628, 9466.77] + - [700, 9466.77] - - [1024, 3394, 1, 4096] - - [634, 8373.91] + - [706, 8373.91] - - [4096, 3265, 1, 1024] - - [628, 9700.89] + - [700, 9700.89] - - [1024, 3014, 1, 4096] - - [631, 9303.09] + - [703, 9303.09] - - [4096, 3497, 1, 1024] - - [626, 9668.6] + - [698, 9668.6] - - [4096, 3354, 1, 1024] - - [628, 9294.31] + - [700, 9294.31] - - [4096, 3055, 1, 1024] - - [627, 9780.88] + - [699, 9780.88] - - [1024, 3499, 1, 4096] - - [640, 8527.04] + - [712, 8527.04] - - [1024, 3162, 1, 4096] - - [648, 8059.02] + - [720, 8059.02] - - [4096, 3244, 1, 1024] - - [628, 9636.86] + - [700, 9636.86] - - [1024, 3437, 1, 4096] - - [647, 8583.41] + - [719, 8583.41] - - [1024, 3356, 1, 4096] - - [649, 8296.95] + - [721, 8296.95] - - [4096, 3139, 1, 1024] - - [628, 9338.7] + - [700, 9338.7] - - [4096, 3508, 1, 1024] - - [628, 9700.54] + - [700, 9700.54] - - [1024, 3235, 1, 4096] - - [646, 8314.59] + - [718, 8314.59] - - [1024, 3910, 1, 4096] - - [633, 9200.21] + - [705, 9200.21] - - [4096, 3371, 1, 1024] - - [626, 9336.97] + - [698, 9336.97] - - [1024, 3751, 1, 4096] - - [633, 8827.67] + - [705, 8827.67] - - [4096, 3325, 1, 1024] - - [626, 9845.68] + - [698, 9845.68] - - [1024, 3413, 1, 4096] - - [634, 8345.78] + - [706, 8345.78] - - [1024, 3542, 1, 4096] - - [646, 8521.71] + - [718, 8521.71] - - [18, 18, 3440, 64] - - [662, 1578.24] + - [734, 1578.24] - - [101, 102, 624, 64] - - [660, 4705.28] + - [732, 4705.28] - - [33708, 3900, 1, 1024] - - [626, 9951.05] + - [698, 9951.05] - - [4096, 3525, 1, 1024] - - [627, 9744.47] + - [699, 9744.47] - - [4096, 3382, 1, 1024] - - [627, 9359.03] + - [699, 9359.03] - - [102, 100, 624, 64] - - [661, 4671.51] + - [733, 4671.51] - - [15, 15, 4096, 64] - - [665, 1129.17] + - [737, 1129.17] - - [1024, 3339, 1, 4096] - - [635, 8326.37] + - [707, 8326.37] - - [4096, 3288, 1, 1024] - - [628, 9761.48] + - [700, 9761.48] - - [92, 92, 688, 64] - - [668, 4903.87] + - [740, 4903.87] - - [1024, 3141, 1, 4096] - - [646, 7975.64] + - [718, 7975.64] - - [1024, 3168, 1, 4096] - - [646, 8083.74] + - [718, 8083.74] - - [4096, 3488, 1, 1024] - - [628, 9646.77] + - [700, 9646.77] - - [4096, 3046, 1, 1024] - - [627, 9767.58] + - [699, 9767.58] - - [1024, 3362, 1, 4096] - - [649, 8458.15] + - [721, 8458.15] - - [33708, 3942, 1, 1024] - - [627, 10060.4] + - [699, 10060.4] - - [4096, 3399, 1, 1024] - - [628, 9406.57] + - [700, 9406.57] - - [1024, 3720, 1, 1024] - - [630, 8639.16] + - [702, 8639.16] - - [4096, 3563, 1, 1024] - - [626, 9836.55] + - [698, 9836.55] - - [1024, 3273, 1, 4096] - - [649, 8221.62] + - [721, 8221.62] - - [4096, 3162, 1, 1024] - - [628, 9400.19] + - [700, 9400.19] - - [1024, 3467, 1, 4096] - - [647, 8342.42] + - [719, 8342.42] - - [1024, 3130, 1, 4096] - - [648, 7933.88] + - [720, 7933.88] - - [1024, 3405, 1, 4096] - - [655, 8406.59] + - [727, 8406.59] - - [4096, 3362, 1, 1024] - - [626, 9312.04] + - [698, 9312.04] - - [1024, 3960, 1, 1024] - - [630, 9082.26] + - [702, 9082.26] - - [2048, 128, 1, 4096] - - [680, 5986.62] + - [752, 5986.62] - - [1024, 3712, 1, 36548] - - [678, 9456.25] + - [750, 9456.25] - - [1024, 128, 1, 1024] - - [681, 3631.53] + - [753, 3631.53] - - [3072, 128, 1, 4096] - - [677, 6145.6] + - [749, 6145.6] - - [1024, 3712, 1, 1024] - - [679, 8933.98] + - [751, 8933.98] + - - [256, 256, 192, 64] + - [756, 8264.74] + - - [768, 4096, 1, 768] + - [769, 9642.18] + - - [768, 64, 1, 768] + - [766, 1850.53] + - - [768, 1280, 1, 768] + - [769, 8738.23] + - - [30522, 320, 1, 768] + - [770, 9733.69] + - - [128, 128, 96, 64] + - [759, 5470.93] + - - [2, 16, 1, 768] + - [762, 2.57742] + - - [30522, 1280, 1, 768] + - [768, 10128.0] + - - [30522, 640, 1, 768] + - [769, 9987.71] + - - [2, 8, 1, 768] + - [761, 1.06] + - - [768, 4096, 1, 3072] + - [771, 9479.51] + - - [768, 32, 1, 768] + - [765, 880.434] + - - [2, 64, 1, 768] + - [762, 10.09024] + - - [256, 256, 96, 64] + - [756, 7614.57] + - - [64, 64, 768, 64] + - [758, 5354.53] + - - [30522, 160, 1, 768] + - [767, 7740.21] + - - [768, 320, 1, 768] + - [760, 5423.77] + - - [128, 128, 384, 64] + - [757, 7180.08] + - - [768, 16, 1, 768] + - [763, 706.476] + - - [3072, 4096, 1, 768] + - [772, 9961.84] + - - [2048, 512, 1, 100] + - [774, 5180.81] + - - [1024, 200, 1, 560] + - [775, 4061.29] + - - [256, 1280, 1, 1024] + - [782, 4337.54] + - - [256, 44505, 1, 1024] + - [818, 8597.79] + - - [10240, 8976, 1, 256] + - [821, 9471.53] + - - [256, 7168, 1, 1024] + - [812, 6718.66] + - - [8448, 8976, 1, 256] + - [804, 9601.41] + - - [18944, 8976, 1, 256] + - [813, 9666.36] + - - [256, 19200, 1, 1024] + - [789, 7489.04] + - - [5632, 8976, 1, 256] + - [801, 9358.49] + - - [256, 23552, 1, 1024] + - [816, 7980.99] + - - [256, 6656, 1, 1024] + - [816, 6287.32] + - - [256, 14336, 1, 1024] + - [811, 7049.36] + - - [256, 12544, 1, 1024] + - [789, 6728.57] + - - [2048, 684, 1, 768] + - [806, 8479.28] + - - [5376, 8976, 1, 256] + - [801, 9519.61] + - - [256, 5888, 1, 1024] + - [821, 6012.5] + - - [19968, 8976, 1, 256] + - [813, 9684.77] + - - [3840, 8976, 1, 256] + - [798, 9461.99] + - - [4608, 8976, 1, 256] + - [798, 9305.92] + - - [256, 684, 1, 1024] + - [824, 3513.16] + - - [256, 22016, 1, 1024] + - [789, 7643.89] + - - [256, 23296, 1, 1024] + - [818, 8048.22] + - - [4864, 8976, 1, 256] + - [796, 9545.72] + - - [256, 7424, 1, 1024] + - [814, 6770.75] + - - [18176, 8976, 1, 256] + - [821, 9729.57] + - - [256, 15104, 1, 1024] + - [810, 7289.18] + - - [8192, 8976, 1, 256] + - [813, 9395.59] + - - [256, 16128, 1, 1024] + - [813, 7461.38] + - - [13312, 8976, 1, 256] + - [821, 9551.07] + - - [256, 21504, 1, 1024] + - [818, 7636.03] + - - [6400, 8976, 1, 256] + - [805, 9561.06] + - - [256, 8960, 1, 1024] + - [780, 6292.46] + - - [1792, 8976, 1, 256] + - [795, 9372.28] + - - [13824, 8976, 1, 256] + - [813, 9585.37] + - - [11776, 8976, 1, 256] + - [813, 9560.44] + - - [256, 20992, 1, 1024] + - [811, 7490.75] + - - [20480, 8976, 1, 256] + - [821, 9610.8] + - - [5888, 8976, 1, 256] + - [792, 9565.3] + - - [256, 10496, 1, 1024] + - [783, 6632.06] + - - [21248, 8976, 1, 256] + - [813, 9755.87] + - - [5120, 8976, 1, 256] + - [821, 9244.69] + - - [7168, 8976, 1, 256] + - [813, 9388.52] + - - [2048, 1536, 1, 768] + - [802, 9446.14] + - - [256, 8192, 1, 1024] + - [807, 6948.99] + - - [4096, 8976, 1, 256] + - [812, 9116.04] + - - [3328, 8976, 1, 256] + - [805, 9434.65] + - - [1280, 8976, 1, 256] + - [803, 9129.9] + - - [2560, 8976, 1, 256] + - [800, 9199.58] + - - [3072, 8976, 1, 256] + - [815, 8963.7] + - - [256, 11776, 1, 1024] + - [793, 6869.9] + - - [18688, 8976, 1, 256] + - [821, 9726.31] + - - [15104, 8976, 1, 256] + - [821, 9715.81] + - - [23552, 8976, 1, 256] + - [813, 9648.52] + - - [6144, 8976, 1, 256] + - [821, 9339.9] + - - [12544, 8976, 1, 256] + - [821, 9654.55] + - - [256, 11264, 1, 1024] + - [794, 6815.08] + - - [2048, 114, 1, 512] + - [825, 4583.6] + - - [4352, 8976, 1, 256] + - [805, 9471.5] + - - [15360, 8976, 1, 256] + - [821, 9583.87] + - - [256, 31488, 1, 1024] + - [820, 8438.11] + - - [28672, 8976, 1, 256] + - [813, 9688.95] + - - [256, 18176, 1, 1024] + - [789, 7405.19] + - - [9728, 8976, 1, 256] + - [821, 9524.25] + - - [256, 2816, 1, 1024] + - [785, 5405.76] + - - [256, 18944, 1, 1024] + - [789, 7503.51] + - - [256, 3584, 1, 1024] + - [788, 6107.25] + - - [7936, 8976, 1, 256] + - [801, 9608.41] + - - [19712, 8976, 1, 256] + - [821, 9736.35] + - - [256, 14848, 1, 1024] + - [794, 7163.52] + - - [256, 8448, 1, 1024] + - [794, 6372.66] + - - [256, 6400, 1, 1024] + - [808, 6395.81] + - - [256, 6144, 1, 1024] + - [819, 6490.32] + - - [9472, 8976, 1, 256] + - [798, 9610.02] + - - [256, 9984, 1, 1024] + - [781, 6484.85] + - - [684, 8976, 1, 256] + - [790, 8128.63] + - - [20992, 8976, 1, 256] + - [813, 9689.75] + - - [2048, 684, 1, 512] + - [797, 7241.88] + - - [2048, 114, 1, 768] + - [823, 4872.56] + - - [8960, 8976, 1, 256] + - [796, 9603.45] + - - [2048, 1536, 1, 512] + - [799, 8830.21] + - - [256, 3328, 1, 1024] + - [787, 5612.65] + - - [33536, 8976, 1, 256] + - [813, 9797.81] + - - [2048, 8976, 1, 256] + - [813, 8975.56] + - - [10496, 8976, 1, 256] + - [804, 9654.53] + - - [256, 5376, 1, 1024] + - [822, 5626.44] + - - [256, 21248, 1, 1024] + - [791, 7525.55] + - - [256, 13312, 1, 1024] + - [789, 6767.21] + - - [16128, 8976, 1, 256] + - [813, 9715.67] + - - [2304, 8976, 1, 256] + - [786, 9433.93] + - - [256, 4864, 1, 1024] + - [776, 5743.65] + - - [17152, 8976, 1, 256] + - [821, 9709.04] + - - [15872, 8976, 1, 256] + - [821, 9657.67] + - - [9984, 8976, 1, 256] + - [798, 9639.84] + - - [256, 14592, 1, 1024] + - [810, 7224.02] + - - [256, 33536, 1, 1024] + - [817, 8147.41] + - - [11264, 8976, 1, 256] + - [813, 9510.06] + - - [31488, 8976, 1, 256] + - [821, 9799.41] + - - [256, 20480, 1, 1024] + - [794, 7498.3] + - - [44505, 8976, 1, 256] + - [805, 9804.88] + - - [13568, 8976, 1, 256] + - [813, 9680.34] + - - [256, 11520, 1, 1024] + - [793, 6805.36] + - - [256, 7936, 1, 1024] + - [809, 6971.87] + - - [2048, 256, 1, 768] + - [779, 7129.23] + - - [256, 4608, 1, 1024] + - [777, 5463.01] + - - [256, 2304, 1, 1024] + - [784, 4842.79] + - - [256, 2560, 1, 1024] + - [785, 5309.35] + - - [2816, 8976, 1, 256] + - [796, 9409.66] - null diff --git a/scripts/performance/sgemm_bert.sh b/scripts/performance/sgemm_bert.sh new file mode 100755 index 000000000..2b0ce38fc --- /dev/null +++ b/scripts/performance/sgemm_bert.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1600 -n 512 -k 1024 --alpha -1.0 --lda 1600 --ldb 1024 --beta 1.0 --ldc 1600 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 512 -k 1 --alpha -1.0 --lda 1024 --ldb 1 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 512 -k 64 --alpha -1.0 --lda 1024 --ldb 64 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 64 -n 512 -k 1 --alpha -1.0 --lda 64 --ldb 1 --beta 1.0 --ldc 64 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 2048 -n 512 -k 1 --alpha -1.0 --lda 2048 --ldb 1 --beta 1.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 100 -n 512 -k 2048 --alpha -1.0 --lda 100 --ldb 2048 --beta 1.0 --ldc 100 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1 -n 512 -k 1 --alpha -1.0 --lda 1 --ldb 1 --beta 1.0 --ldc 1 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 200 -k 32 --alpha -1.0 --lda 512 --ldb 32 --beta 1.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 560 -n 200 -k 1024 --alpha -1.0 --lda 560 --ldb 1024 --beta 1.0 --ldc 560 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 32 -n 200 -k 1 --alpha -1.0 --lda 32 --ldb 1 --beta 1.0 --ldc 32 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 200 -k 1 --alpha -1.0 --lda 1024 --ldb 1 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1 -n 200 -k 1 --alpha -1.0 --lda 1 --ldb 1 --beta 1.0 --ldc 1 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 200 -k 1 --alpha -1.0 --lda 512 --ldb 1 --beta 1.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 64 -k 512 --alpha -1.0 --lda 1024 --ldb 64 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 100 -n 2048 -k 512 --alpha -1.0 --lda 100 --ldb 2048 --beta 1.0 --ldc 100 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 2048 -n 1 -k 512 --alpha -1.0 --lda 2048 --ldb 1 --beta 1.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 2048 -n 2048 -k 512 --alpha -1.0 --lda 2048 --ldb 2048 --beta 1.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1600 -n 1024 -k 512 --alpha -1.0 --lda 1600 --ldb 1024 --beta 1.0 --ldc 1600 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 1024 -k 512 --alpha -1.0 --lda 1024 --ldb 1024 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 560 -n 1024 -k 200 --alpha -1.0 --lda 560 --ldb 1024 --beta 1.0 --ldc 560 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 1024 -k 200 --alpha -1.0 --lda 1024 --ldb 1024 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 1 -k 200 --alpha -1.0 --lda 1024 --ldb 1 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 32 -k 200 --alpha -1.0 --lda 512 --ldb 32 --beta 1.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 200 --alpha -1.0 --lda 512 --ldb 512 --beta 1.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 1024 -n 512 -k 1600 --alpha -1.0 --lda 1600 --ldb 1600 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 512 -k 100 --alpha -1.0 --lda 100 --ldb 100 --beta 1.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 1024 -n 200 -k 560 --alpha -1.0 --lda 560 --ldb 560 --beta 1.0 --ldc 1024 diff --git a/scripts/performance/sgemm_dlrm.sh b/scripts/performance/sgemm_dlrm.sh new file mode 100755 index 000000000..2b0ce38fc --- /dev/null +++ b/scripts/performance/sgemm_dlrm.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1600 -n 512 -k 1024 --alpha -1.0 --lda 1600 --ldb 1024 --beta 1.0 --ldc 1600 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 512 -k 1 --alpha -1.0 --lda 1024 --ldb 1 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 512 -k 64 --alpha -1.0 --lda 1024 --ldb 64 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 64 -n 512 -k 1 --alpha -1.0 --lda 64 --ldb 1 --beta 1.0 --ldc 64 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 2048 -n 512 -k 1 --alpha -1.0 --lda 2048 --ldb 1 --beta 1.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 100 -n 512 -k 2048 --alpha -1.0 --lda 100 --ldb 2048 --beta 1.0 --ldc 100 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1 -n 512 -k 1 --alpha -1.0 --lda 1 --ldb 1 --beta 1.0 --ldc 1 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 200 -k 32 --alpha -1.0 --lda 512 --ldb 32 --beta 1.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 560 -n 200 -k 1024 --alpha -1.0 --lda 560 --ldb 1024 --beta 1.0 --ldc 560 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 32 -n 200 -k 1 --alpha -1.0 --lda 32 --ldb 1 --beta 1.0 --ldc 32 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 200 -k 1 --alpha -1.0 --lda 1024 --ldb 1 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1 -n 200 -k 1 --alpha -1.0 --lda 1 --ldb 1 --beta 1.0 --ldc 1 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 200 -k 1 --alpha -1.0 --lda 512 --ldb 1 --beta 1.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 64 -k 512 --alpha -1.0 --lda 1024 --ldb 64 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 100 -n 2048 -k 512 --alpha -1.0 --lda 100 --ldb 2048 --beta 1.0 --ldc 100 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 2048 -n 1 -k 512 --alpha -1.0 --lda 2048 --ldb 1 --beta 1.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 2048 -n 2048 -k 512 --alpha -1.0 --lda 2048 --ldb 2048 --beta 1.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1600 -n 1024 -k 512 --alpha -1.0 --lda 1600 --ldb 1024 --beta 1.0 --ldc 1600 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 1024 -k 512 --alpha -1.0 --lda 1024 --ldb 1024 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 560 -n 1024 -k 200 --alpha -1.0 --lda 560 --ldb 1024 --beta 1.0 --ldc 560 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 1024 -k 200 --alpha -1.0 --lda 1024 --ldb 1024 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 1 -k 200 --alpha -1.0 --lda 1024 --ldb 1 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 32 -k 200 --alpha -1.0 --lda 512 --ldb 32 --beta 1.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 200 --alpha -1.0 --lda 512 --ldb 512 --beta 1.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 1024 -n 512 -k 1600 --alpha -1.0 --lda 1600 --ldb 1600 --beta 1.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 512 -k 100 --alpha -1.0 --lda 100 --ldb 100 --beta 1.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 1024 -n 200 -k 560 --alpha -1.0 --lda 560 --ldb 560 --beta 1.0 --ldc 1024 diff --git a/scripts/performance/sgemm_phantom.sh b/scripts/performance/sgemm_phantom.sh new file mode 100755 index 000000000..af3cf06d6 --- /dev/null +++ b/scripts/performance/sgemm_phantom.sh @@ -0,0 +1,291 @@ +#!/bin/bash + +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 22016 --alpha 1.0 --lda 256 --ldb 22016 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 768 -n 215 -k 2048 --alpha 1.0 --lda 768 --ldb 2048 --beta 0.0 --ldc 768 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 44505 --alpha 1.0 --lda 256 --ldb 44505 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 13568 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 17152 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 17152 --alpha 1.0 --lda 256 --ldb 17152 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 16128 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 33536 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 33536 --alpha 1.0 --lda 256 --ldb 33536 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 20480 --alpha 1.0 --lda 256 --ldb 20480 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 22016 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 7168 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 18944 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 14336 --alpha 1.0 --lda 256 --ldb 14336 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 9728 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 8960 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 9984 --alpha 1.0 --lda 256 --ldb 9984 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 5632 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 28672 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 7424 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 12288 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 10496 --alpha 1.0 --lda 256 --ldb 10496 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 7424 --alpha 1.0 --lda 256 --ldb 7424 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 12288 --alpha 1.0 --lda 256 --ldb 12288 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 19712 --alpha 1.0 --lda 256 --ldb 19712 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 10496 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 23552 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 6400 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 21504 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 20480 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 13312 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 5888 --alpha 1.0 --lda 256 --ldb 5888 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 11520 --alpha 1.0 --lda 256 --ldb 11520 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 8448 --alpha 1.0 --lda 256 --ldb 8448 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 4352 --alpha 1.0 --lda 256 --ldb 4352 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 9984 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 2048 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 4608 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 3328 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 7168 --alpha 1.0 --lda 256 --ldb 7168 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 8192 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 3072 --alpha 1.0 --lda 256 --ldb 3072 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 768 -n 256 -k 2048 --alpha 1.0 --lda 768 --ldb 2048 --beta 0.0 --ldc 768 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 19712 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 5632 --alpha 1.0 --lda 256 --ldb 5632 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 40448 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 7936 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 1792 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 10240 --alpha 1.0 --lda 256 --ldb 10240 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 20992 --alpha 1.0 --lda 256 --ldb 20992 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 2560 --alpha 1.0 --lda 256 --ldb 2560 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 15104 --alpha 1.0 --lda 256 --ldb 15104 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 2816 --alpha 1.0 --lda 256 --ldb 2816 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 2304 --alpha 1.0 --lda 256 --ldb 2304 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 14848 --alpha 1.0 --lda 256 --ldb 14848 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 11264 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 9472 --alpha 1.0 --lda 256 --ldb 9472 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 15872 --alpha 1.0 --lda 256 --ldb 15872 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 20992 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 8448 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 11008 --alpha 1.0 --lda 256 --ldb 11008 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 4096 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 7680 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 8960 --alpha 1.0 --lda 256 --ldb 8960 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 6656 --alpha 1.0 --lda 256 --ldb 6656 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 5376 --alpha 1.0 --lda 256 --ldb 5376 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 10240 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 2048 --alpha 1.0 --lda 256 --ldb 2048 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 215 -k 2048 --alpha 1.0 --lda 512 --ldb 2048 --beta 0.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 6144 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 1536 --alpha 1.0 --lda 256 --ldb 1536 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 3840 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 256 -k 2048 --alpha 1.0 --lda 512 --ldb 2048 --beta 0.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 15104 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 9728 --alpha 1.0 --lda 256 --ldb 9728 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 14592 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 5120 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 6144 --alpha 1.0 --lda 256 --ldb 6144 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 11520 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 14848 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 8704 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 8192 --alpha 1.0 --lda 256 --ldb 8192 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 3072 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 4864 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 19968 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 2560 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 4864 --alpha 1.0 --lda 256 --ldb 4864 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 19968 --alpha 1.0 --lda 256 --ldb 19968 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 14336 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1024 -n 11008 -k 256 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 8976 -k 26112 --alpha 1.0 --lda 256 --ldb 26112 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 22016 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 22016 -k 8976 --alpha 1.0 --lda 256 --ldb 22016 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 15104 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 15104 -k 8976 --alpha 1.0 --lda 256 --ldb 15104 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 19968 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 13568 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 13568 -k 8976 --alpha 1.0 --lda 256 --ldb 13568 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 19712 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 17152 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 17152 -k 8976 --alpha 1.0 --lda 256 --ldb 17152 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 16128 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 16128 -k 8976 --alpha 1.0 --lda 256 --ldb 16128 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 33536 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 33536 -k 8976 --alpha 1.0 --lda 256 --ldb 33536 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 20480 -k 8976 --alpha 1.0 --lda 256 --ldb 20480 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 1280 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 1280 -k 8976 --alpha 1.0 --lda 256 --ldb 1280 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 15872 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 13312 -k 8976 --alpha 1.0 --lda 256 --ldb 13312 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 18944 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 5632 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 6400 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 3584 -k 8976 --alpha 1.0 --lda 256 --ldb 3584 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 6144 -k 8976 --alpha 1.0 --lda 256 --ldb 6144 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 20992 -k 8976 --alpha 1.0 --lda 256 --ldb 20992 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 17408 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 5888 -k 8976 --alpha 1.0 --lda 256 --ldb 5888 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 11008 -k 8976 --alpha 1.0 --lda 256 --ldb 11008 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 12032 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 7168 -k 8976 --alpha 1.0 --lda 256 --ldb 7168 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 2816 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 11520 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 7680 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 2816 -k 8976 --alpha 1.0 --lda 256 --ldb 2816 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 3328 -k 8976 --alpha 1.0 --lda 256 --ldb 3328 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 10496 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 14336 -k 8976 --alpha 1.0 --lda 256 --ldb 14336 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 12544 -k 8976 --alpha 1.0 --lda 256 --ldb 12544 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 14336 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 9728 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 10496 -k 8976 --alpha 1.0 --lda 256 --ldb 10496 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 19968 -k 8976 --alpha 1.0 --lda 256 --ldb 19968 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 21504 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 8192 -k 8976 --alpha 1.0 --lda 256 --ldb 8192 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 2560 -k 8976 --alpha 1.0 --lda 256 --ldb 2560 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 9984 -k 8976 --alpha 1.0 --lda 256 --ldb 9984 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 12800 -k 8976 --alpha 1.0 --lda 256 --ldb 12800 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 3328 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 32512 -k 8976 --alpha 1.0 --lda 256 --ldb 32512 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 3840 -k 8976 --alpha 1.0 --lda 256 --ldb 3840 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 5120 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 44505 -k 8976 --alpha 1.0 --lda 256 --ldb 44505 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 768 -n 2048 -k 256 --alpha 1.0 --lda 768 --ldb 2048 --beta 0.0 --ldc 768 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 8448 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 14848 -k 8976 --alpha 1.0 --lda 256 --ldb 14848 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 28672 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 5120 -k 8976 --alpha 1.0 --lda 256 --ldb 5120 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 5632 -k 8976 --alpha 1.0 --lda 256 --ldb 5632 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 21248 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 9984 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 21248 -k 8976 --alpha 1.0 --lda 256 --ldb 21248 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 2304 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 11264 -k 8976 --alpha 1.0 --lda 256 --ldb 11264 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 4608 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 4096 -k 8976 --alpha 1.0 --lda 256 --ldb 4096 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 3072 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 8192 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 7936 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 3584 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 10240 -k 8976 --alpha 1.0 --lda 256 --ldb 10240 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 4096 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 14848 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 2048 -k 256 --alpha 1.0 --lda 512 --ldb 2048 --beta 0.0 --ldc 512 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 26112 -k 8976 --alpha 1.0 --lda 256 --ldb 26112 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 17408 -k 8976 --alpha 1.0 --lda 256 --ldb 17408 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 11776 -k 8976 --alpha 1.0 --lda 256 --ldb 11776 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 8960 -k 8976 --alpha 1.0 --lda 256 --ldb 8960 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 5376 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 9728 -k 8976 --alpha 1.0 --lda 256 --ldb 9728 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 1024 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 4352 -k 8976 --alpha 1.0 --lda 256 --ldb 4352 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 11520 -k 8976 --alpha 1.0 --lda 256 --ldb 11520 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 18688 -k 8976 --alpha 1.0 --lda 256 --ldb 18688 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 4352 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1024 -n 256 -k 6144 --alpha 1.0 --lda 1024 --ldb 256 --beta 0.0 --ldc 1024 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 4864 -k 8976 --alpha 1.0 --lda 256 --ldb 4864 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 15872 -k 8976 --alpha 1.0 --lda 256 --ldb 15872 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 684 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 684 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 684 -k 512 --alpha 1.0 --lda 512 --ldb 512 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 1536 -k 512 --alpha 1.0 --lda 512 --ldb 512 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 1536 -k 768 --alpha 1.0 --lda 768 --ldb 768 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 22344 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 114 -k 768 --alpha 1.0 --lda 768 --ldb 768 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 18176 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 18176 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 18176 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 15104 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 15104 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 15104 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 44505 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 44505 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 114 -k 512 --alpha 1.0 --lda 512 --ldb 512 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 44505 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 215 -k 512 --alpha 1.0 --lda 512 --ldb 512 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 215 -k 768 --alpha 1.0 --lda 768 --ldb 768 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 19968 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 19968 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 16128 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 16128 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 33536 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 33536 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 33536 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 20992 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 20992 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 20992 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 20480 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 20480 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 19712 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 19712 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 1280 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 1280 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 1280 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 7168 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 22016 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 13568 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 13568 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 18944 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 31488 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 3072 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 3072 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 9984 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 9984 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 9984 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 256 -k 512 --alpha 1.0 --lda 512 --ldb 512 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 5888 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 5888 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 5888 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 13312 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 13312 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 11776 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 9728 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 9728 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 15360 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 15360 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 7424 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2816 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 2816 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 10496 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 10496 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 21248 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 21248 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 14336 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 6400 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 18688 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 18688 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 23296 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 9472 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 9472 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 18944 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 18944 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 3584 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 3328 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 3328 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 13312 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 3328 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 6400 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 6400 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 7936 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 15872 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 15872 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 5376 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 2560 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 13824 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 13824 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 4352 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 4352 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 11776 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 11776 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 2304 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 2816 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 684 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 7936 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 7936 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 11264 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 11264 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 6656 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 21504 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 4096 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 4096 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 1792 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 1792 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 28672 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 28672 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 31488 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 31488 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 11520 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 1536 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 5376 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 5376 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 6144 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 8448 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 5632 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 5632 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 256 -k 768 --alpha 1.0 --lda 768 --ldb 768 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 19200 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 4864 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 4864 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 16128 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 12544 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 6144 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 6144 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 8448 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 8448 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2304 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 2304 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 4608 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 12544 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 12544 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 8960 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 8960 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 20480 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 5120 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 5120 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 8192 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 8192 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 8192 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 14848 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 23552 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 10496 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 14592 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2560 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 2560 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 11264 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 4608 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 4608 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 8960 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 10240 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 10240 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 21248 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 3840 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 3840 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 23552 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 23552 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 684 -k 768 --alpha 1.0 --lda 768 --ldb 768 --beta 0.0 --ldc 2048 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 7168 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 7168 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 256 -n 4864 -k 1024 --alpha 1.0 --lda 1024 --ldb 1024 --beta 0.0 --ldc 256 +./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 17152 -n 8976 -k 256 --alpha 1.0 --lda 256 --ldb 256 --beta 0.0 --ldc 17152 diff --git a/scripts/performance/sgemm_winograd.sh b/scripts/performance/sgemm_winograd.sh new file mode 100755 index 000000000..3065358bb --- /dev/null +++ b/scripts/performance/sgemm_winograd.sh @@ -0,0 +1,343 @@ +#!/bin/bash + +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 24 -k 288 --alpha 1.0 --lda 288 --stride_a 147456 --ldb 288 --stride_b 6912 --beta 0.0 --ldc 24 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 486 -k 288 --alpha 1.0 --lda 288 --stride_a 147456 --ldb 288 --stride_b 139968 --beta 0.0 --ldc 486 --stride_c 248832 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 784 --alpha 1.0 --lda 784 --stride_a 50176 --ldb 784 --stride_b 100352 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 12544 --alpha 1.0 --lda 12544 --stride_a 802816 --ldb 12544 --stride_b 1605632 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 1568 --alpha 1.0 --lda 1568 --stride_a 100352 --ldb 1568 --stride_b 200704 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 25088 --alpha 1.0 --lda 25088 --stride_a 1605632 --ldb 25088 --stride_b 3211264 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 50176 --alpha 1.0 --lda 50176 --stride_a 3211264 --ldb 50176 --stride_b 6422528 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 53824 --alpha 1.0 --lda 53824 --stride_a 3444736 --ldb 53824 --stride_b 6889472 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 6272 --alpha 1.0 --lda 6272 --stride_a 401408 --ldb 6272 --stride_b 802816 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 32 -n 64 -k 90 --alpha 1.0 --lda 90 --stride_a 2880 --ldb 90 --stride_b 5760 --beta 0.0 --ldc 64 --stride_c 2048 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 32 -n 64 -k 1440 --alpha 1.0 --lda 1440 --stride_a 46080 --ldb 1440 --stride_b 92160 --beta 0.0 --ldc 64 --stride_c 2048 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 112 -n 224 -k 512 --alpha 1.0 --lda 512 --stride_a 57344 --ldb 512 --stride_b 114688 --beta 0.0 --ldc 224 --stride_c 25088 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 512 --alpha 1.0 --lda 512 --stride_a 65536 --ldb 512 --stride_b 131072 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 144 -n 288 -k 512 --alpha 1.0 --lda 512 --stride_a 73728 --ldb 512 --stride_b 147456 --beta 0.0 --ldc 288 --stride_c 41472 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 320 -k 512 --alpha 1.0 --lda 512 --stride_a 81920 --ldb 512 --stride_b 163840 --beta 0.0 --ldc 320 --stride_c 51200 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 512 --alpha 1.0 --lda 512 --stride_a 98304 --ldb 512 --stride_b 196608 --beta 0.0 --ldc 384 --stride_c 73728 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 208 -k 512 --alpha 1.0 --lda 512 --stride_a 49152 --ldb 512 --stride_b 106496 --beta 0.0 --ldc 208 --stride_c 19968 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 2048 --alpha 1.0 --lda 2048 --stride_a 393216 --ldb 2048 --stride_b 786432 --beta 0.0 --ldc 384 --stride_c 73728 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 4096 --alpha 1.0 --lda 4096 --stride_a 786432 --ldb 4096 --stride_b 1572864 --beta 0.0 --ldc 384 --stride_c 73728 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 800 --alpha 1.0 --lda 800 --stride_a 153600 --ldb 800 --stride_b 307200 --beta 0.0 --ldc 384 --stride_c 73728 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 1024 --alpha 1.0 --lda 1024 --stride_a 196608 --ldb 1024 --stride_b 393216 --beta 0.0 --ldc 384 --stride_c 73728 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 2048 --alpha 1.0 --lda 2048 --stride_a 524288 --ldb 2048 --stride_b 524288 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 384 -k 2048 --alpha 1.0 --lda 2048 --stride_a 524288 --ldb 2048 --stride_b 786432 --beta 0.0 --ldc 384 --stride_c 98304 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 4096 --alpha 1.0 --lda 4096 --stride_a 1048576 --ldb 4096 --stride_b 1048576 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 384 -k 4096 --alpha 1.0 --lda 4096 --stride_a 1048576 --ldb 4096 --stride_b 1572864 --beta 0.0 --ldc 384 --stride_c 98304 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 800 --alpha 1.0 --lda 800 --stride_a 204800 --ldb 800 --stride_b 204800 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 384 -k 800 --alpha 1.0 --lda 800 --stride_a 204800 --ldb 800 --stride_b 307200 --beta 0.0 --ldc 384 --stride_c 98304 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 1024 --alpha 1.0 --lda 1024 --stride_a 262144 --ldb 1024 --stride_b 262144 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 384 -k 1024 --alpha 1.0 --lda 1024 --stride_a 262144 --ldb 1024 --stride_b 393216 --beta 0.0 --ldc 384 --stride_c 98304 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 256 -k 2048 --alpha 1.0 --lda 2048 --stride_a 786432 --ldb 2048 --stride_b 524288 --beta 0.0 --ldc 256 --stride_c 98304 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 384 -k 2048 --alpha 1.0 --lda 2048 --stride_a 786432 --ldb 2048 --stride_b 786432 --beta 0.0 --ldc 384 --stride_c 147456 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 256 -k 4096 --alpha 1.0 --lda 4096 --stride_a 1572864 --ldb 4096 --stride_b 1048576 --beta 0.0 --ldc 256 --stride_c 98304 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 384 -k 4096 --alpha 1.0 --lda 4096 --stride_a 1572864 --ldb 4096 --stride_b 1572864 --beta 0.0 --ldc 384 --stride_c 147456 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 256 -k 800 --alpha 1.0 --lda 800 --stride_a 307200 --ldb 800 --stride_b 204800 --beta 0.0 --ldc 256 --stride_c 98304 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 384 -k 800 --alpha 1.0 --lda 800 --stride_a 307200 --ldb 800 --stride_b 307200 --beta 0.0 --ldc 384 --stride_c 147456 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 256 -k 1024 --alpha 1.0 --lda 1024 --stride_a 393216 --ldb 1024 --stride_b 262144 --beta 0.0 --ldc 256 --stride_c 98304 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 384 -k 1024 --alpha 1.0 --lda 1024 --stride_a 393216 --ldb 1024 --stride_b 393216 --beta 0.0 --ldc 384 --stride_c 147456 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 112 -n 224 -k 2048 --alpha 1.0 --lda 2048 --stride_a 229376 --ldb 2048 --stride_b 458752 --beta 0.0 --ldc 224 --stride_c 25088 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 16 --alpha 1.0 --lda 16 --stride_a 2048 --ldb 16 --stride_b 4096 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 2048 --alpha 1.0 --lda 2048 --stride_a 262144 --ldb 2048 --stride_b 524288 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 512 --alpha 1.0 --lda 512 --stride_a 65536 --ldb 512 --stride_b 65536 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 160 -k 512 --alpha 1.0 --lda 512 --stride_a 65536 --ldb 512 --stride_b 81920 --beta 0.0 --ldc 160 --stride_c 20480 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 192 -k 512 --alpha 1.0 --lda 512 --stride_a 65536 --ldb 512 --stride_b 98304 --beta 0.0 --ldc 192 --stride_c 24576 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 128 --alpha 1.0 --lda 128 --stride_a 16384 --ldb 128 --stride_b 32768 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 144 -n 288 -k 2048 --alpha 1.0 --lda 2048 --stride_a 294912 --ldb 2048 --stride_b 589824 --beta 0.0 --ldc 288 --stride_c 41472 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 320 -k 2048 --alpha 1.0 --lda 2048 --stride_a 327680 --ldb 2048 --stride_b 655360 --beta 0.0 --ldc 320 --stride_c 51200 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 160 -k 512 --alpha 1.0 --lda 512 --stride_a 81920 --ldb 512 --stride_b 81920 --beta 0.0 --ldc 160 --stride_c 25600 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 192 -k 512 --alpha 1.0 --lda 512 --stride_a 81920 --ldb 512 --stride_b 98304 --beta 0.0 --ldc 192 --stride_c 30720 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 192 -k 512 --alpha 1.0 --lda 512 --stride_a 98304 --ldb 512 --stride_b 98304 --beta 0.0 --ldc 192 --stride_c 36864 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 256 -k 512 --alpha 1.0 --lda 512 --stride_a 98304 --ldb 512 --stride_b 131072 --beta 0.0 --ldc 256 --stride_c 49152 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 16 --alpha 1.0 --lda 16 --stride_a 4096 --ldb 16 --stride_b 4096 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 256 --alpha 1.0 --lda 256 --stride_a 65536 --ldb 256 --stride_b 65536 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 8192 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 512 --alpha 1.0 --lda 512 --stride_a 131072 --ldb 512 --stride_b 131072 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 128 --alpha 1.0 --lda 128 --stride_a 32768 --ldb 128 --stride_b 32768 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 16 --alpha 1.0 --lda 16 --stride_a 8192 --ldb 16 --stride_b 8192 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 256 --alpha 1.0 --lda 256 --stride_a 131072 --ldb 256 --stride_b 131072 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 32 --alpha 1.0 --lda 32 --stride_a 16384 --ldb 32 --stride_b 16384 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 512 --alpha 1.0 --lda 512 --stride_a 262144 --ldb 512 --stride_b 262144 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 1024 --alpha 1.0 --lda 1024 --stride_a 524288 --ldb 1024 --stride_b 524288 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 128 --alpha 1.0 --lda 128 --stride_a 65536 --ldb 128 --stride_b 65536 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 96 -k 512 --alpha 1.0 --lda 512 --stride_a 32768 --ldb 512 --stride_b 49152 --beta 0.0 --ldc 96 --stride_c 6144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 208 -k 2048 --alpha 1.0 --lda 2048 --stride_a 196608 --ldb 2048 --stride_b 425984 --beta 0.0 --ldc 208 --stride_c 19968 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 128 -k 512 --alpha 1.0 --lda 512 --stride_a 49152 --ldb 512 --stride_b 65536 --beta 0.0 --ldc 128 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 32 -n 64 -k 43808 --alpha 1.0 --lda 43808 --stride_a 1401856 --ldb 43808 --stride_b 2803712 --beta 0.0 --ldc 64 --stride_c 2048 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 32 -n 32 -k 43808 --alpha 1.0 --lda 43808 --stride_a 1401856 --ldb 43808 --stride_b 1401856 --beta 0.0 --ldc 32 --stride_c 1024 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 440 --alpha 1.0 --lda 440 --stride_a 56320 --ldb 440 --stride_b 56320 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 7040 --alpha 1.0 --lda 7040 --stride_a 901120 --ldb 7040 --stride_b 901120 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 880 --alpha 1.0 --lda 880 --stride_a 112640 --ldb 880 --stride_b 112640 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 24 -k 800 --alpha 1.0 --lda 800 --stride_a 409600 --ldb 800 --stride_b 19200 --beta 0.0 --ldc 24 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 486 -k 800 --alpha 1.0 --lda 800 --stride_a 409600 --ldb 800 --stride_b 388800 --beta 0.0 --ldc 486 --stride_c 248832 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 3136 --alpha 1.0 --lda 3136 --stride_a 9408 --ldb 3136 --stride_b 200704 --beta 0.0 --ldc 64 --stride_c 192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 50176 --alpha 1.0 --lda 50176 --stride_a 150528 --ldb 50176 --stride_b 3211264 --beta 0.0 --ldc 64 --stride_c 192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 6272 --alpha 1.0 --lda 6272 --stride_a 18816 --ldb 6272 --stride_b 401408 --beta 0.0 --ldc 64 --stride_c 192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 100352 --alpha 1.0 --lda 100352 --stride_a 301056 --ldb 100352 --stride_b 6422528 --beta 0.0 --ldc 64 --stride_c 192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 200704 --alpha 1.0 --lda 200704 --stride_a 602112 --ldb 200704 --stride_b 12845056 --beta 0.0 --ldc 64 --stride_c 192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 25088 --alpha 1.0 --lda 25088 --stride_a 75264 --ldb 25088 --stride_b 1605632 --beta 0.0 --ldc 64 --stride_c 192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 16 -n 32 -k 360 --alpha 1.0 --lda 360 --stride_a 5760 --ldb 360 --stride_b 11520 --beta 0.0 --ldc 32 --stride_c 512 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 16 -n 32 -k 5760 --alpha 1.0 --lda 5760 --stride_a 92160 --ldb 5760 --stride_b 184320 --beta 0.0 --ldc 32 --stride_c 512 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 192 -k 2048 --alpha 1.0 --lda 2048 --stride_a 262144 --ldb 2048 --stride_b 393216 --beta 0.0 --ldc 192 --stride_c 24576 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 128 -k 2048 --alpha 1.0 --lda 2048 --stride_a 196608 --ldb 2048 --stride_b 262144 --beta 0.0 --ldc 128 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 49 --alpha 1.0 --lda 49 --stride_a 6272 --ldb 49 --stride_b 6272 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 392 --alpha 1.0 --lda 392 --stride_a 50176 --ldb 392 --stride_b 50176 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 192 -k 6272 --alpha 1.0 --lda 6272 --stride_a 802816 --ldb 6272 --stride_b 1204224 --beta 0.0 --ldc 192 --stride_c 24576 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 784 --alpha 1.0 --lda 784 --stride_a 100352 --ldb 784 --stride_b 100352 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 98 --alpha 1.0 --lda 98 --stride_a 12544 --ldb 98 --stride_b 12544 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 1568 --alpha 1.0 --lda 1568 --stride_a 200704 --ldb 1568 --stride_b 200704 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 192 -k 1568 --alpha 1.0 --lda 1568 --stride_a 200704 --ldb 1568 --stride_b 301056 --beta 0.0 --ldc 192 --stride_c 24576 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 3136 --alpha 1.0 --lda 3136 --stride_a 401408 --ldb 3136 --stride_b 401408 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 49 --alpha 1.0 --lda 49 --stride_a 12544 --ldb 49 --stride_b 25088 --beta 0.0 --ldc 512 --stride_c 131072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 784 --alpha 1.0 --lda 784 --stride_a 200704 --ldb 784 --stride_b 401408 --beta 0.0 --ldc 512 --stride_c 131072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 98 --alpha 1.0 --lda 98 --stride_a 25088 --ldb 98 --stride_b 50176 --beta 0.0 --ldc 512 --stride_c 131072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 1568 --alpha 1.0 --lda 1568 --stride_a 401408 --ldb 1568 --stride_b 802816 --beta 0.0 --ldc 512 --stride_c 131072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 3136 --alpha 1.0 --lda 3136 --stride_a 802816 --ldb 3136 --stride_b 1605632 --beta 0.0 --ldc 512 --stride_c 131072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 392 --alpha 1.0 --lda 392 --stride_a 100352 --ldb 392 --stride_b 200704 --beta 0.0 --ldc 512 --stride_c 131072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 1568 --alpha 1.0 --lda 1568 --stride_a 802816 --ldb 1568 --stride_b 802816 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 3136 --alpha 1.0 --lda 3136 --stride_a 1605632 --ldb 3136 --stride_b 1605632 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 1568 --alpha 1.0 --lda 1568 --stride_a 100352 --ldb 1568 --stride_b 100352 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 96 -k 1568 --alpha 1.0 --lda 1568 --stride_a 100352 --ldb 1568 --stride_b 150528 --beta 0.0 --ldc 96 --stride_c 6144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 128 -k 6272 --alpha 1.0 --lda 6272 --stride_a 602112 --ldb 6272 --stride_b 802816 --beta 0.0 --ldc 128 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 128 -k 1568 --alpha 1.0 --lda 1568 --stride_a 150528 --ldb 1568 --stride_b 200704 --beta 0.0 --ldc 128 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 96 -k 1568 --alpha 1.0 --lda 1568 --stride_a 150528 --ldb 1568 --stride_b 150528 --beta 0.0 --ldc 96 --stride_c 9216 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 16 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 512 --beta 0.0 --ldc 16 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 324 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 10368 --beta 0.0 --ldc 324 --stride_c 82944 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 1760 --alpha 1.0 --lda 1760 --stride_a 112640 --ldb 1760 --stride_b 112640 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 28160 --alpha 1.0 --lda 28160 --stride_a 1802240 --ldb 28160 --stride_b 1802240 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 3520 --alpha 1.0 --lda 3520 --stride_a 225280 --ldb 3520 --stride_b 225280 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 224 -k 2592 --alpha 1.0 --lda 2592 --stride_a 497664 --ldb 2592 --stride_b 580608 --beta 0.0 --ldc 224 --stride_c 43008 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 96 -k 2592 --alpha 1.0 --lda 2592 --stride_a 165888 --ldb 2592 --stride_b 248832 --beta 0.0 --ldc 96 --stride_c 6144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 96 -k 2592 --alpha 1.0 --lda 2592 --stride_a 248832 --ldb 2592 --stride_b 248832 --beta 0.0 --ldc 96 --stride_c 9216 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 3200 --alpha 1.0 --lda 3200 --stride_a 409600 --ldb 3200 --stride_b 409600 --beta 0.0 --ldc 128 --stride_c 16384 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 3200 --alpha 1.0 --lda 3200 --stride_a 409600 --ldb 3200 --stride_b 819200 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 16 -k 3200 --alpha 1.0 --lda 3200 --stride_a 819200 --ldb 3200 --stride_b 51200 --beta 0.0 --ldc 16 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 3200 --alpha 1.0 --lda 3200 --stride_a 819200 --ldb 3200 --stride_b 819200 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 324 -k 3200 --alpha 1.0 --lda 3200 --stride_a 819200 --ldb 3200 --stride_b 1036800 --beta 0.0 --ldc 324 --stride_c 82944 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 33 --alpha 1.0 --lda 33 --stride_a 16896 --ldb 33 --stride_b 16896 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 528 --alpha 1.0 --lda 528 --stride_a 270336 --ldb 528 --stride_b 270336 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 66 --alpha 1.0 --lda 66 --stride_a 33792 --ldb 66 --stride_b 33792 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 1 -n 16 -k 1440 --alpha 1.0 --lda 1440 --stride_a 1440 --ldb 1440 --stride_b 23040 --beta 0.0 --ldc 16 --stride_c 16 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 1 -n 16 -k 23040 --alpha 1.0 --lda 23040 --stride_a 23040 --ldb 23040 --stride_b 368640 --beta 0.0 --ldc 16 --stride_c 16 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 192 -k 8192 --alpha 1.0 --lda 8192 --stride_a 524288 --ldb 8192 --stride_b 1572864 --beta 0.0 --ldc 192 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 196 --alpha 1.0 --lda 196 --stride_a 12544 --ldb 196 --stride_b 12544 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 32 --alpha 1.0 --lda 32 --stride_a 4096 --ldb 32 --stride_b 8192 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 24 -k 128 --alpha 1.0 --lda 128 --stride_a 32768 --ldb 128 --stride_b 3072 --beta 0.0 --ldc 24 --stride_c 6144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 486 -k 128 --alpha 1.0 --lda 128 --stride_a 32768 --ldb 128 --stride_b 62208 --beta 0.0 --ldc 486 --stride_c 124416 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 6272 --alpha 1.0 --lda 6272 --stride_a 401408 --ldb 6272 --stride_b 401408 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 12544 --alpha 1.0 --lda 12544 --stride_a 802816 --ldb 12544 --stride_b 802816 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 196 --alpha 1.0 --lda 196 --stride_a 25088 --ldb 196 --stride_b 50176 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 3136 --alpha 1.0 --lda 3136 --stride_a 401408 --ldb 3136 --stride_b 802816 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 392 --alpha 1.0 --lda 392 --stride_a 50176 --ldb 392 --stride_b 100352 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 6272 --alpha 1.0 --lda 6272 --stride_a 802816 --ldb 6272 --stride_b 1605632 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 12544 --alpha 1.0 --lda 12544 --stride_a 1605632 --ldb 12544 --stride_b 3211264 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 1568 --alpha 1.0 --lda 1568 --stride_a 200704 --ldb 1568 --stride_b 401408 --beta 0.0 --ldc 256 --stride_c 32768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 6272 --alpha 1.0 --lda 6272 --stride_a 1605632 --ldb 6272 --stride_b 1605632 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 12544 --alpha 1.0 --lda 12544 --stride_a 3211264 --ldb 12544 --stride_b 3211264 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 192 -k 25088 --alpha 1.0 --lda 25088 --stride_a 1605632 --ldb 25088 --stride_b 4816896 --beta 0.0 --ldc 192 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 3136 --alpha 1.0 --lda 3136 --stride_a 200704 --ldb 3136 --stride_b 200704 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 392 --alpha 1.0 --lda 392 --stride_a 25088 --ldb 392 --stride_b 25088 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 192 -k 6272 --alpha 1.0 --lda 6272 --stride_a 401408 --ldb 6272 --stride_b 1204224 --beta 0.0 --ldc 192 --stride_c 12288 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 30 --alpha 1.0 --lda 30 --stride_a 1920 --ldb 30 --stride_b 3840 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 480 --alpha 1.0 --lda 480 --stride_a 30720 --ldb 480 --stride_b 61440 --beta 0.0 --ldc 128 --stride_c 8192 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 147 -n 64 -k 18816 --alpha 1.0 --lda 18816 --stride_a 2765952 --ldb 18816 --stride_b 1204224 --beta 0.0 --ldc 64 --stride_c 9408 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 149 -n 32 -k 19072 --alpha 1.0 --lda 19072 --stride_a 2841728 --ldb 19072 --stride_b 610304 --beta 0.0 --ldc 32 --stride_c 4768 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 8 -n 384 -k 14336 --alpha 1.0 --lda 14336 --stride_a 114688 --ldb 14336 --stride_b 5505024 --beta 0.0 --ldc 384 --stride_c 3072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 35 -n 96 -k 8960 --alpha 1.0 --lda 8960 --stride_a 313600 --ldb 8960 --stride_b 860160 --beta 0.0 --ldc 96 --stride_c 3360 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 73 -n 192 -k 23360 --alpha 1.0 --lda 23360 --stride_a 1705280 --ldb 23360 --stride_b 4485120 --beta 0.0 --ldc 192 --stride_c 14016 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 35 -n 96 -k 13440 --alpha 1.0 --lda 13440 --stride_a 470400 --ldb 13440 --stride_b 1290240 --beta 0.0 --ldc 96 --stride_c 3360 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 96 -k 10368 --alpha 1.0 --lda 10368 --stride_a 663552 --ldb 10368 --stride_b 995328 --beta 0.0 --ldc 96 --stride_c 6144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 80 -n 192 -k 10368 --alpha 1.0 --lda 10368 --stride_a 829440 --ldb 10368 --stride_b 1990656 --beta 0.0 --ldc 192 --stride_c 15360 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 11552 --alpha 1.0 --lda 11552 --stride_a 739328 --ldb 11552 --stride_b 739328 --beta 0.0 --ldc 64 --stride_c 4096 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 224 -k 128 --alpha 1.0 --lda 128 --stride_a 20480 --ldb 128 --stride_b 28672 --beta 0.0 --ldc 224 --stride_c 35840 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 320 -k 128 --alpha 1.0 --lda 128 --stride_a 20480 --ldb 128 --stride_b 40960 --beta 0.0 --ldc 320 --stride_c 51200 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 224 -k 128 --alpha 1.0 --lda 128 --stride_a 24576 --ldb 128 --stride_b 28672 --beta 0.0 --ldc 224 --stride_c 43008 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 320 -k 128 --alpha 1.0 --lda 128 --stride_a 24576 --ldb 128 --stride_b 40960 --beta 0.0 --ldc 320 --stride_c 61440 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 128 --alpha 1.0 --lda 128 --stride_a 24576 --ldb 128 --stride_b 49152 --beta 0.0 --ldc 384 --stride_c 73728 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 224 -n 224 -k 128 --alpha 1.0 --lda 128 --stride_a 28672 --ldb 128 --stride_b 28672 --beta 0.0 --ldc 224 --stride_c 50176 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 4 --alpha 1.0 --lda 4 --stride_a 1024 --ldb 4 --stride_b 2048 --beta 0.0 --ldc 512 --stride_c 131072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 16384 --beta 0.0 --ldc 512 --stride_c 131072 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 4 --alpha 1.0 --lda 4 --stride_a 2048 --ldb 4 --stride_b 2048 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 64 --alpha 1.0 --lda 64 --stride_a 32768 --ldb 64 --stride_b 32768 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 8 --alpha 1.0 --lda 8 --stride_a 4096 --ldb 8 --stride_b 4096 --beta 0.0 --ldc 512 --stride_c 262144 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 105 --alpha 1.0 --lda 105 --stride_a 26880 --ldb 105 --stride_b 26880 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 1680 --alpha 1.0 --lda 1680 --stride_a 430080 --ldb 1680 --stride_b 430080 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 210 --alpha 1.0 --lda 210 --stride_a 53760 --ldb 210 --stride_b 53760 --beta 0.0 --ldc 256 --stride_c 65536 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 448 -n 384 -k 128 --alpha 1.0 --lda 128 --stride_a 57344 --ldb 128 --stride_b 49152 --beta 0.0 --ldc 384 --stride_c 172032 --batch 36 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 24 -k 128 --alpha 1.0 --lda 128 --stride_a 65536 --ldb 128 --stride_b 3072 --beta 0.0 --ldc 24 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 486 -k 128 --alpha 1.0 --lda 128 --stride_a 65536 --ldb 128 --stride_b 62208 --beta 0.0 --ldc 486 --stride_c 248832 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 361 --alpha 1.0 --lda 361 --stride_a 23104 --ldb 361 --stride_b 46208 --beta 0.0 --ldc 128 --stride_c 8192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 5776 --alpha 1.0 --lda 5776 --stride_a 369664 --ldb 5776 --stride_b 739328 --beta 0.0 --ldc 128 --stride_c 8192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 722 --alpha 1.0 --lda 722 --stride_a 46208 --ldb 722 --stride_b 92416 --beta 0.0 --ldc 128 --stride_c 8192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 11552 --alpha 1.0 --lda 11552 --stride_a 739328 --ldb 11552 --stride_b 1478656 --beta 0.0 --ldc 128 --stride_c 8192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 23104 --alpha 1.0 --lda 23104 --stride_a 1478656 --ldb 23104 --stride_b 2957312 --beta 0.0 --ldc 128 --stride_c 8192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 2888 --alpha 1.0 --lda 2888 --stride_a 184832 --ldb 2888 --stride_b 369664 --beta 0.0 --ldc 128 --stride_c 8192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 32 -n 64 -k 40 --alpha 1.0 --lda 40 --stride_a 1280 --ldb 40 --stride_b 2560 --beta 0.0 --ldc 64 --stride_c 2048 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 32 -n 64 -k 640 --alpha 1.0 --lda 640 --stride_a 20480 --ldb 640 --stride_b 40960 --beta 0.0 --ldc 64 --stride_c 2048 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 112 -n 224 -k 242 --alpha 1.0 --lda 242 --stride_a 27104 --ldb 242 --stride_b 54208 --beta 0.0 --ldc 224 --stride_c 25088 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 242 --alpha 1.0 --lda 242 --stride_a 30976 --ldb 242 --stride_b 61952 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 144 -n 288 -k 242 --alpha 1.0 --lda 242 --stride_a 34848 --ldb 242 --stride_b 69696 --beta 0.0 --ldc 288 --stride_c 41472 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 320 -k 242 --alpha 1.0 --lda 242 --stride_a 38720 --ldb 242 --stride_b 77440 --beta 0.0 --ldc 320 --stride_c 51200 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 242 --alpha 1.0 --lda 242 --stride_a 46464 --ldb 242 --stride_b 92928 --beta 0.0 --ldc 384 --stride_c 73728 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 208 -k 242 --alpha 1.0 --lda 242 --stride_a 23232 --ldb 242 --stride_b 50336 --beta 0.0 --ldc 208 --stride_c 19968 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 1152 --alpha 1.0 --lda 1152 --stride_a 221184 --ldb 1152 --stride_b 442368 --beta 0.0 --ldc 384 --stride_c 73728 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 2304 --alpha 1.0 --lda 2304 --stride_a 442368 --ldb 2304 --stride_b 884736 --beta 0.0 --ldc 384 --stride_c 73728 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 450 --alpha 1.0 --lda 450 --stride_a 86400 --ldb 450 --stride_b 172800 --beta 0.0 --ldc 384 --stride_c 73728 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 576 --alpha 1.0 --lda 576 --stride_a 110592 --ldb 576 --stride_b 221184 --beta 0.0 --ldc 384 --stride_c 73728 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 1152 --alpha 1.0 --lda 1152 --stride_a 294912 --ldb 1152 --stride_b 294912 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 384 -k 1152 --alpha 1.0 --lda 1152 --stride_a 294912 --ldb 1152 --stride_b 442368 --beta 0.0 --ldc 384 --stride_c 98304 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 2304 --alpha 1.0 --lda 2304 --stride_a 589824 --ldb 2304 --stride_b 589824 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 384 -k 2304 --alpha 1.0 --lda 2304 --stride_a 589824 --ldb 2304 --stride_b 884736 --beta 0.0 --ldc 384 --stride_c 98304 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 450 --alpha 1.0 --lda 450 --stride_a 115200 --ldb 450 --stride_b 115200 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 384 -k 450 --alpha 1.0 --lda 450 --stride_a 115200 --ldb 450 --stride_b 172800 --beta 0.0 --ldc 384 --stride_c 98304 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 576 --alpha 1.0 --lda 576 --stride_a 147456 --ldb 576 --stride_b 147456 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 384 -k 576 --alpha 1.0 --lda 576 --stride_a 147456 --ldb 576 --stride_b 221184 --beta 0.0 --ldc 384 --stride_c 98304 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 256 -k 1152 --alpha 1.0 --lda 1152 --stride_a 442368 --ldb 1152 --stride_b 294912 --beta 0.0 --ldc 256 --stride_c 98304 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 384 -k 1152 --alpha 1.0 --lda 1152 --stride_a 442368 --ldb 1152 --stride_b 442368 --beta 0.0 --ldc 384 --stride_c 147456 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 256 -k 2304 --alpha 1.0 --lda 2304 --stride_a 884736 --ldb 2304 --stride_b 589824 --beta 0.0 --ldc 256 --stride_c 98304 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 384 -k 2304 --alpha 1.0 --lda 2304 --stride_a 884736 --ldb 2304 --stride_b 884736 --beta 0.0 --ldc 384 --stride_c 147456 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 256 -k 450 --alpha 1.0 --lda 450 --stride_a 172800 --ldb 450 --stride_b 115200 --beta 0.0 --ldc 256 --stride_c 98304 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 384 -k 450 --alpha 1.0 --lda 450 --stride_a 172800 --ldb 450 --stride_b 172800 --beta 0.0 --ldc 384 --stride_c 147456 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 256 -k 576 --alpha 1.0 --lda 576 --stride_a 221184 --ldb 576 --stride_b 147456 --beta 0.0 --ldc 256 --stride_c 98304 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 384 -n 384 -k 576 --alpha 1.0 --lda 576 --stride_a 221184 --ldb 576 --stride_b 221184 --beta 0.0 --ldc 384 --stride_c 147456 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 112 -n 224 -k 1152 --alpha 1.0 --lda 1152 --stride_a 129024 --ldb 1152 --stride_b 258048 --beta 0.0 --ldc 224 --stride_c 25088 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 112 -n 224 -k 288 --alpha 1.0 --lda 288 --stride_a 32256 --ldb 288 --stride_b 64512 --beta 0.0 --ldc 224 --stride_c 25088 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 9 --alpha 1.0 --lda 9 --stride_a 1152 --ldb 9 --stride_b 2304 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 1152 --alpha 1.0 --lda 1152 --stride_a 147456 --ldb 1152 --stride_b 294912 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 288 --alpha 1.0 --lda 288 --stride_a 36864 --ldb 288 --stride_b 36864 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 160 -k 288 --alpha 1.0 --lda 288 --stride_a 36864 --ldb 288 --stride_b 46080 --beta 0.0 --ldc 160 --stride_c 20480 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 192 -k 288 --alpha 1.0 --lda 288 --stride_a 36864 --ldb 288 --stride_b 55296 --beta 0.0 --ldc 192 --stride_c 24576 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 288 --alpha 1.0 --lda 288 --stride_a 36864 --ldb 288 --stride_b 73728 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 72 --alpha 1.0 --lda 72 --stride_a 9216 --ldb 72 --stride_b 18432 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 144 -n 288 -k 1152 --alpha 1.0 --lda 1152 --stride_a 165888 --ldb 1152 --stride_b 331776 --beta 0.0 --ldc 288 --stride_c 41472 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 144 -n 288 -k 288 --alpha 1.0 --lda 288 --stride_a 41472 --ldb 288 --stride_b 82944 --beta 0.0 --ldc 288 --stride_c 41472 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 320 -k 1152 --alpha 1.0 --lda 1152 --stride_a 184320 --ldb 1152 --stride_b 368640 --beta 0.0 --ldc 320 --stride_c 51200 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 160 -k 288 --alpha 1.0 --lda 288 --stride_a 46080 --ldb 288 --stride_b 46080 --beta 0.0 --ldc 160 --stride_c 25600 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 192 -k 288 --alpha 1.0 --lda 288 --stride_a 46080 --ldb 288 --stride_b 55296 --beta 0.0 --ldc 192 --stride_c 30720 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 320 -k 288 --alpha 1.0 --lda 288 --stride_a 46080 --ldb 288 --stride_b 92160 --beta 0.0 --ldc 320 --stride_c 51200 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 192 -k 288 --alpha 1.0 --lda 288 --stride_a 55296 --ldb 288 --stride_b 55296 --beta 0.0 --ldc 192 --stride_c 36864 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 256 -k 288 --alpha 1.0 --lda 288 --stride_a 55296 --ldb 288 --stride_b 73728 --beta 0.0 --ldc 256 --stride_c 49152 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 9 --alpha 1.0 --lda 9 --stride_a 2304 --ldb 9 --stride_b 2304 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 144 --alpha 1.0 --lda 144 --stride_a 36864 --ldb 144 --stride_b 36864 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 18 --alpha 1.0 --lda 18 --stride_a 4608 --ldb 18 --stride_b 4608 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 288 --alpha 1.0 --lda 288 --stride_a 73728 --ldb 288 --stride_b 73728 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 72 --alpha 1.0 --lda 72 --stride_a 18432 --ldb 72 --stride_b 18432 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 9 --alpha 1.0 --lda 9 --stride_a 4608 --ldb 9 --stride_b 4608 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 144 --alpha 1.0 --lda 144 --stride_a 73728 --ldb 144 --stride_b 73728 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 18 --alpha 1.0 --lda 18 --stride_a 9216 --ldb 18 --stride_b 9216 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 288 --alpha 1.0 --lda 288 --stride_a 147456 --ldb 288 --stride_b 147456 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 576 --alpha 1.0 --lda 576 --stride_a 294912 --ldb 576 --stride_b 294912 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 72 --alpha 1.0 --lda 72 --stride_a 36864 --ldb 72 --stride_b 36864 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 96 -k 288 --alpha 1.0 --lda 288 --stride_a 18432 --ldb 288 --stride_b 27648 --beta 0.0 --ldc 96 --stride_c 6144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 208 -k 1152 --alpha 1.0 --lda 1152 --stride_a 110592 --ldb 1152 --stride_b 239616 --beta 0.0 --ldc 208 --stride_c 19968 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 128 -k 288 --alpha 1.0 --lda 288 --stride_a 27648 --ldb 288 --stride_b 36864 --beta 0.0 --ldc 128 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 208 -k 288 --alpha 1.0 --lda 288 --stride_a 27648 --ldb 288 --stride_b 59904 --beta 0.0 --ldc 208 --stride_c 19968 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 32 -n 64 -k 20000 --alpha 1.0 --lda 20000 --stride_a 640000 --ldb 20000 --stride_b 1280000 --beta 0.0 --ldc 64 --stride_c 2048 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 32 -n 32 -k 20000 --alpha 1.0 --lda 20000 --stride_a 640000 --ldb 20000 --stride_b 640000 --beta 0.0 --ldc 32 --stride_c 1024 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 210 --alpha 1.0 --lda 210 --stride_a 26880 --ldb 210 --stride_b 26880 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 3360 --alpha 1.0 --lda 3360 --stride_a 430080 --ldb 3360 --stride_b 430080 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 420 --alpha 1.0 --lda 420 --stride_a 53760 --ldb 420 --stride_b 53760 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 24 -k 512 --alpha 1.0 --lda 512 --stride_a 262144 --ldb 512 --stride_b 12288 --beta 0.0 --ldc 24 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 486 -k 512 --alpha 1.0 --lda 512 --stride_a 262144 --ldb 512 --stride_b 248832 --beta 0.0 --ldc 486 --stride_c 248832 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 1444 --alpha 1.0 --lda 1444 --stride_a 4332 --ldb 1444 --stride_b 92416 --beta 0.0 --ldc 64 --stride_c 192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 23104 --alpha 1.0 --lda 23104 --stride_a 69312 --ldb 23104 --stride_b 1478656 --beta 0.0 --ldc 64 --stride_c 192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 2888 --alpha 1.0 --lda 2888 --stride_a 8664 --ldb 2888 --stride_b 184832 --beta 0.0 --ldc 64 --stride_c 192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 46208 --alpha 1.0 --lda 46208 --stride_a 138624 --ldb 46208 --stride_b 2957312 --beta 0.0 --ldc 64 --stride_c 192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 92416 --alpha 1.0 --lda 92416 --stride_a 277248 --ldb 92416 --stride_b 5914624 --beta 0.0 --ldc 64 --stride_c 192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 3 -n 64 -k 11552 --alpha 1.0 --lda 11552 --stride_a 34656 --ldb 11552 --stride_b 739328 --beta 0.0 --ldc 64 --stride_c 192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 16 -n 32 -k 160 --alpha 1.0 --lda 160 --stride_a 2560 --ldb 160 --stride_b 5120 --beta 0.0 --ldc 32 --stride_c 512 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 16 -n 32 -k 2560 --alpha 1.0 --lda 2560 --stride_a 40960 --ldb 2560 --stride_b 81920 --beta 0.0 --ldc 32 --stride_c 512 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 192 -k 946 --alpha 1.0 --lda 946 --stride_a 121088 --ldb 946 --stride_b 181632 --beta 0.0 --ldc 192 --stride_c 24576 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 128 -k 946 --alpha 1.0 --lda 946 --stride_a 90816 --ldb 946 --stride_b 121088 --beta 0.0 --ldc 128 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 25 --alpha 1.0 --lda 25 --stride_a 3200 --ldb 25 --stride_b 3200 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 200 --alpha 1.0 --lda 200 --stride_a 25600 --ldb 200 --stride_b 25600 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 192 -k 3200 --alpha 1.0 --lda 3200 --stride_a 409600 --ldb 3200 --stride_b 614400 --beta 0.0 --ldc 192 --stride_c 24576 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 400 --alpha 1.0 --lda 400 --stride_a 51200 --ldb 400 --stride_b 51200 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 50 --alpha 1.0 --lda 50 --stride_a 6400 --ldb 50 --stride_b 6400 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 800 --alpha 1.0 --lda 800 --stride_a 102400 --ldb 800 --stride_b 102400 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 192 -k 800 --alpha 1.0 --lda 800 --stride_a 102400 --ldb 800 --stride_b 153600 --beta 0.0 --ldc 192 --stride_c 24576 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 1600 --alpha 1.0 --lda 1600 --stride_a 204800 --ldb 1600 --stride_b 204800 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 25 --alpha 1.0 --lda 25 --stride_a 6400 --ldb 25 --stride_b 12800 --beta 0.0 --ldc 512 --stride_c 131072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 400 --alpha 1.0 --lda 400 --stride_a 102400 --ldb 400 --stride_b 204800 --beta 0.0 --ldc 512 --stride_c 131072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 50 --alpha 1.0 --lda 50 --stride_a 12800 --ldb 50 --stride_b 25600 --beta 0.0 --ldc 512 --stride_c 131072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 800 --alpha 1.0 --lda 800 --stride_a 204800 --ldb 800 --stride_b 409600 --beta 0.0 --ldc 512 --stride_c 131072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 1600 --alpha 1.0 --lda 1600 --stride_a 409600 --ldb 1600 --stride_b 819200 --beta 0.0 --ldc 512 --stride_c 131072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 200 --alpha 1.0 --lda 200 --stride_a 51200 --ldb 200 --stride_b 102400 --beta 0.0 --ldc 512 --stride_c 131072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 800 --alpha 1.0 --lda 800 --stride_a 409600 --ldb 800 --stride_b 409600 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 1600 --alpha 1.0 --lda 1600 --stride_a 819200 --ldb 1600 --stride_b 819200 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 800 --alpha 1.0 --lda 800 --stride_a 51200 --ldb 800 --stride_b 51200 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 96 -k 800 --alpha 1.0 --lda 800 --stride_a 51200 --ldb 800 --stride_b 76800 --beta 0.0 --ldc 96 --stride_c 6144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 128 -k 3200 --alpha 1.0 --lda 3200 --stride_a 307200 --ldb 3200 --stride_b 409600 --beta 0.0 --ldc 128 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 128 -k 800 --alpha 1.0 --lda 800 --stride_a 76800 --ldb 800 --stride_b 102400 --beta 0.0 --ldc 128 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 96 -k 800 --alpha 1.0 --lda 800 --stride_a 76800 --ldb 800 --stride_b 76800 --beta 0.0 --ldc 96 --stride_c 9216 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 16 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 512 --beta 0.0 --ldc 16 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 324 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 10368 --beta 0.0 --ldc 324 --stride_c 82944 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 826 --alpha 1.0 --lda 826 --stride_a 52864 --ldb 826 --stride_b 52864 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 13216 --alpha 1.0 --lda 13216 --stride_a 845824 --ldb 13216 --stride_b 845824 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 1652 --alpha 1.0 --lda 1652 --stride_a 105728 --ldb 1652 --stride_b 105728 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 224 -k 1152 --alpha 1.0 --lda 1152 --stride_a 221184 --ldb 1152 --stride_b 258048 --beta 0.0 --ldc 224 --stride_c 43008 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 96 -k 1152 --alpha 1.0 --lda 1152 --stride_a 73728 --ldb 1152 --stride_b 110592 --beta 0.0 --ldc 96 --stride_c 6144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 96 -n 96 -k 1152 --alpha 1.0 --lda 1152 --stride_a 110592 --ldb 1152 --stride_b 110592 --beta 0.0 --ldc 96 --stride_c 9216 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 128 -k 1568 --alpha 1.0 --lda 1568 --stride_a 200704 --ldb 1568 --stride_b 200704 --beta 0.0 --ldc 128 --stride_c 16384 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 1568 --alpha 1.0 --lda 1568 --stride_a 200704 --ldb 1568 --stride_b 401408 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 16 -k 1568 --alpha 1.0 --lda 1568 --stride_a 401408 --ldb 1568 --stride_b 25088 --beta 0.0 --ldc 16 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 1568 --alpha 1.0 --lda 1568 --stride_a 401408 --ldb 1568 --stride_b 401408 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 324 -k 1568 --alpha 1.0 --lda 1568 --stride_a 401408 --ldb 1568 --stride_b 508032 --beta 0.0 --ldc 324 --stride_c 82944 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 14 --alpha 1.0 --lda 14 --stride_a 7168 --ldb 14 --stride_b 7168 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 224 --alpha 1.0 --lda 224 --stride_a 114688 --ldb 224 --stride_b 114688 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 28 --alpha 1.0 --lda 28 --stride_a 14336 --ldb 28 --stride_b 14336 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 1 -n 16 -k 640 --alpha 1.0 --lda 640 --stride_a 640 --ldb 640 --stride_b 10240 --beta 0.0 --ldc 16 --stride_c 16 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 1 -n 16 -k 10240 --alpha 1.0 --lda 10240 --stride_a 10240 --ldb 10240 --stride_b 163840 --beta 0.0 --ldc 16 --stride_c 16 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 192 -k 3698 --alpha 1.0 --lda 3698 --stride_a 236672 --ldb 3698 --stride_b 710016 --beta 0.0 --ldc 192 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 81 --alpha 1.0 --lda 81 --stride_a 5184 --ldb 81 --stride_b 5184 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 648 --alpha 1.0 --lda 648 --stride_a 41472 --ldb 648 --stride_b 41472 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 32 --alpha 1.0 --lda 32 --stride_a 4096 --ldb 32 --stride_b 8192 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 24 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 768 --beta 0.0 --ldc 24 --stride_c 6144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 486 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 15552 --beta 0.0 --ldc 486 --stride_c 124416 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 3200 --alpha 1.0 --lda 3200 --stride_a 204800 --ldb 3200 --stride_b 204800 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 6400 --alpha 1.0 --lda 6400 --stride_a 409600 --ldb 6400 --stride_b 409600 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 100 --alpha 1.0 --lda 100 --stride_a 12800 --ldb 100 --stride_b 25600 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 1600 --alpha 1.0 --lda 1600 --stride_a 204800 --ldb 1600 --stride_b 409600 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 200 --alpha 1.0 --lda 200 --stride_a 25600 --ldb 200 --stride_b 51200 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 3200 --alpha 1.0 --lda 3200 --stride_a 409600 --ldb 3200 --stride_b 819200 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 6400 --alpha 1.0 --lda 6400 --stride_a 819200 --ldb 6400 --stride_b 1638400 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 128 -n 256 -k 800 --alpha 1.0 --lda 800 --stride_a 102400 --ldb 800 --stride_b 204800 --beta 0.0 --ldc 256 --stride_c 32768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 3200 --alpha 1.0 --lda 3200 --stride_a 819200 --ldb 3200 --stride_b 819200 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 6400 --alpha 1.0 --lda 6400 --stride_a 1638400 --ldb 6400 --stride_b 1638400 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 192 -k 12800 --alpha 1.0 --lda 12800 --stride_a 819200 --ldb 12800 --stride_b 2457600 --beta 0.0 --ldc 192 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 100 --alpha 1.0 --lda 100 --stride_a 6400 --ldb 100 --stride_b 6400 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 1600 --alpha 1.0 --lda 1600 --stride_a 102400 --ldb 1600 --stride_b 102400 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 200 --alpha 1.0 --lda 200 --stride_a 12800 --ldb 200 --stride_b 12800 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 192 -k 3200 --alpha 1.0 --lda 3200 --stride_a 204800 --ldb 3200 --stride_b 614400 --beta 0.0 --ldc 192 --stride_c 12288 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 10 --alpha 1.0 --lda 10 --stride_a 640 --ldb 10 --stride_b 1280 --beta 0.0 --ldc 128 --stride_c 8192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 128 -k 160 --alpha 1.0 --lda 160 --stride_a 10240 --ldb 160 --stride_b 20480 --beta 0.0 --ldc 128 --stride_c 8192 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 147 -n 64 -k 9702 --alpha 1.0 --lda 9702 --stride_a 1426194 --ldb 9702 --stride_b 620928 --beta 0.0 --ldc 64 --stride_c 9408 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 149 -n 32 -k 8195 --alpha 1.0 --lda 8195 --stride_a 1221055 --ldb 8195 --stride_b 262240 --beta 0.0 --ldc 32 --stride_c 4768 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 8 -n 384 -k 6600 --alpha 1.0 --lda 6600 --stride_a 52800 --ldb 6600 --stride_b 2534400 --beta 0.0 --ldc 384 --stride_c 3072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 35 -n 96 -k 4235 --alpha 1.0 --lda 4235 --stride_a 148225 --ldb 4235 --stride_b 406560 --beta 0.0 --ldc 96 --stride_c 3360 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 73 -n 192 -k 10439 --alpha 1.0 --lda 10439 --stride_a 762047 --ldb 10439 --stride_b 2004288 --beta 0.0 --ldc 192 --stride_c 14016 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 35 -n 96 -k 6160 --alpha 1.0 --lda 6160 --stride_a 215600 --ldb 6160 --stride_b 591360 --beta 0.0 --ldc 96 --stride_c 3360 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 96 -k 4608 --alpha 1.0 --lda 4608 --stride_a 294912 --ldb 4608 --stride_b 442368 --beta 0.0 --ldc 96 --stride_c 6144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 80 -n 192 -k 4608 --alpha 1.0 --lda 4608 --stride_a 368640 --ldb 4608 --stride_b 884736 --beta 0.0 --ldc 192 --stride_c 15360 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 64 -k 5408 --alpha 1.0 --lda 5408 --stride_a 346112 --ldb 5408 --stride_b 346112 --beta 0.0 --ldc 64 --stride_c 4096 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 320 -k 512 --alpha 1.0 --lda 512 --stride_a 81920 --ldb 512 --stride_b 163840 --beta 0.0 --ldc 320 --stride_c 51200 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 224 -k 128 --alpha 1.0 --lda 128 --stride_a 20480 --ldb 128 --stride_b 28672 --beta 0.0 --ldc 224 --stride_c 35840 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 160 -n 320 -k 128 --alpha 1.0 --lda 128 --stride_a 20480 --ldb 128 --stride_b 40960 --beta 0.0 --ldc 320 --stride_c 51200 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 512 --alpha 1.0 --lda 512 --stride_a 98304 --ldb 512 --stride_b 196608 --beta 0.0 --ldc 384 --stride_c 73728 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 224 -k 128 --alpha 1.0 --lda 128 --stride_a 24576 --ldb 128 --stride_b 28672 --beta 0.0 --ldc 224 --stride_c 43008 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 320 -k 128 --alpha 1.0 --lda 128 --stride_a 24576 --ldb 128 --stride_b 40960 --beta 0.0 --ldc 320 --stride_c 61440 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 192 -n 384 -k 128 --alpha 1.0 --lda 128 --stride_a 24576 --ldb 128 --stride_b 49152 --beta 0.0 --ldc 384 --stride_c 73728 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 224 -n 224 -k 128 --alpha 1.0 --lda 128 --stride_a 28672 --ldb 128 --stride_b 28672 --beta 0.0 --ldc 224 --stride_c 50176 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 4 --alpha 1.0 --lda 4 --stride_a 1024 --ldb 4 --stride_b 2048 --beta 0.0 --ldc 512 --stride_c 131072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 32 --alpha 1.0 --lda 32 --stride_a 8192 --ldb 32 --stride_b 16384 --beta 0.0 --ldc 512 --stride_c 131072 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 4 --alpha 1.0 --lda 4 --stride_a 2048 --ldb 4 --stride_b 2048 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 64 --alpha 1.0 --lda 64 --stride_a 32768 --ldb 64 --stride_b 32768 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 8 --alpha 1.0 --lda 8 --stride_a 4096 --ldb 8 --stride_b 4096 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 128 --alpha 1.0 --lda 128 --stride_a 65536 --ldb 128 --stride_b 65536 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 256 --alpha 1.0 --lda 256 --stride_a 131072 --ldb 256 --stride_b 131072 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 32 --alpha 1.0 --lda 32 --stride_a 16384 --ldb 32 --stride_b 16384 --beta 0.0 --ldc 512 --stride_c 262144 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 56 --alpha 1.0 --lda 56 --stride_a 14336 --ldb 56 --stride_b 14336 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 896 --alpha 1.0 --lda 896 --stride_a 229376 --ldb 896 --stride_b 229376 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 256 -n 256 -k 112 --alpha 1.0 --lda 112 --stride_a 28672 --ldb 112 --stride_b 28672 --beta 0.0 --ldc 256 --stride_c 65536 --batch 64 +./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 448 -n 384 -k 128 --alpha 1.0 --lda 128 --stride_a 57344 --ldb 128 --stride_b 49152 --beta 0.0 --ldc 384 --stride_c 172032 --batch 64 From e1c911cd62886e8b202c4cf29f37835da3c5ea2c Mon Sep 17 00:00:00 2001 From: wgilmart Date: Tue, 6 Aug 2019 15:39:48 -0700 Subject: [PATCH 2/2] replicate new logic files to asm_ci --- .../asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml | 64568 +++++++++++++-- .../asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml | 68626 ++++++++++------ .../asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml | 54380 ++++++++---- 3 files changed, 140395 insertions(+), 47179 deletions(-) diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml index 6d020d6fa..b440f65ce 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -16658,8 +16658,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -16822,8 +16822,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -16982,8 +16982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17146,8 +17146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17306,8 +17306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17470,8 +17470,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17630,8 +17630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17790,8 +17790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -17950,8 +17950,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18114,8 +18114,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18274,8 +18274,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18434,8 +18434,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18594,8 +18594,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18758,8 +18758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -18925,8 +18925,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19086,8 +19086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19247,8 +19247,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19412,8 +19412,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19573,8 +19573,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19734,8 +19734,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -19895,8 +19895,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20056,8 +20056,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20221,8 +20221,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20386,8 +20386,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20547,8 +20547,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20708,8 +20708,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -20869,8 +20869,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21030,8 +21030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21191,8 +21191,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21352,8 +21352,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21513,8 +21513,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21674,8 +21674,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21835,8 +21835,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -21996,8 +21996,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22157,8 +22157,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22322,8 +22322,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22487,8 +22487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22650,8 +22650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22817,8 +22817,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -22982,8 +22982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23145,8 +23145,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23312,8 +23312,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23475,8 +23475,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23642,8 +23642,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23807,8 +23807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -23970,8 +23970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24137,8 +24137,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24300,8 +24300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24467,8 +24467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24630,8 +24630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24797,8 +24797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -24966,8 +24966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25133,8 +25133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25298,8 +25298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -25347,11 +25347,11 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25362,8 +25362,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -25371,31 +25371,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25409,10 +25406,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25420,26 +25417,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25449,6 +25454,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25458,6 +25464,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25472,35 +25479,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 166 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id007 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25511,40 +25526,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25558,10 +25570,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25569,26 +25581,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25598,6 +25618,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25607,6 +25628,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25621,35 +25643,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 167 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25659,41 +25689,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25707,10 +25738,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25718,19 +25749,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -25738,6 +25776,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25747,6 +25786,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25756,6 +25796,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25770,79 +25811,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 168 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -25855,11 +25905,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -25869,17 +25919,24 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -25887,6 +25944,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -25896,6 +25954,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -25905,6 +25964,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -25919,35 +25979,42607 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 169 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 293 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 298 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 299 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 300 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 301 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 302 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 303 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 304 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 305 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 306 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 307 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 308 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 309 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 310 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 311 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 312 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 313 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 314 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 315 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 316 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 317 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 318 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 319 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 384 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 384 + LdsOffsetB_Blk: 1408 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 320 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 321 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 322 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 323 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 324 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 325 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 326 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 327 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 328 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 329 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 330 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 331 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 332 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 333 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 334 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 335 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 336 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 337 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 338 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 339 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id012 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 340 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 341 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 342 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 343 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 344 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 345 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 346 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 347 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 348 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 349 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 350 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id012 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 351 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 352 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 353 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 354 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 355 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 356 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 357 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 358 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 359 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 360 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 361 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 362 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id013 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 363 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 364 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 365 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 366 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 367 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 368 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 369 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id017 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 370 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 371 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id016 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 372 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 373 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 374 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 375 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 376 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 377 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 378 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 379 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 380 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 381 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 382 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id024 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 383 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 384 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id026 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 385 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id020 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 386 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 387 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id027 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 388 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 389 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id031 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 390 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 391 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 392 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 393 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id030 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 394 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 395 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 396 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id029 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 397 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id028 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id031 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 398 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 399 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 400 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 401 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 402 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 403 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 404 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 405 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 406 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 407 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id033 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 408 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 409 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 410 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 411 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id032 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id035 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 412 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 413 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 414 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 415 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 416 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 417 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 418 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 419 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 420 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 421 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 422 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 423 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 424 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 425 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 426 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 427 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 428 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 429 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 430 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 431 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 640 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 432 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 640 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 433 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 434 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 435 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 436 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -25958,8 +68590,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -25967,31 +68599,188 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 768 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 437 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 2 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26005,9 +68794,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -26016,26 +68805,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26045,6 +68840,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26054,6 +68850,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26068,47 +68865,216 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 170 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 438 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 439 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -26116,31 +69082,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 + LSPA: 2 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 16 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26153,7 +69115,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -26166,25 +69128,31 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26194,6 +69162,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26203,6 +69172,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26217,79 +69187,83 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 171 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 440 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 2 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26302,10 +69276,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -26314,26 +69288,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26343,6 +69323,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26352,6 +69333,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26366,79 +69348,83 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 172 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 441 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26451,11 +69437,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26464,25 +69450,31 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26492,6 +69484,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26501,6 +69494,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26515,35 +69509,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 173 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 442 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -26554,40 +69556,36 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26601,9 +69599,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -26614,24 +69612,30 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26641,6 +69645,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26650,6 +69655,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26664,35 +69670,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 174 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 443 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -26703,8 +69717,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -26712,31 +69726,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26750,10 +69760,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26761,26 +69771,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26790,6 +69806,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26799,6 +69816,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26813,35 +69831,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 175 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 444 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -26852,8 +69878,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -26861,31 +69887,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -26900,9 +69922,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -26910,26 +69932,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -26939,6 +69967,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -26948,6 +69977,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -26962,47 +69992,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 176 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 445 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -27010,31 +70048,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27047,11 +70081,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27059,26 +70093,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27088,6 +70128,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27097,6 +70138,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27111,79 +70153,87 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 177 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 446 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27196,11 +70246,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27208,15 +70258,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27228,6 +70283,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27237,6 +70293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27246,6 +70303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27260,35 +70318,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 178 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 447 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27298,41 +70364,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27346,10 +70412,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27357,15 +70423,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27377,6 +70448,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27386,6 +70458,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27395,6 +70468,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27409,35 +70483,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 179 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 448 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27447,8 +70529,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -27465,23 +70547,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27495,10 +70577,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27508,13 +70590,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27526,6 +70611,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27535,6 +70621,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27544,6 +70631,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27558,35 +70646,45 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 180 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 449 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27596,41 +70694,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27645,9 +70743,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27655,15 +70753,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27675,6 +70778,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27684,6 +70788,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27693,6 +70798,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27707,35 +70813,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 181 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 450 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id004 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27745,8 +70859,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -27763,23 +70877,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 LSPA: 8 LSPB: 16 - LVCA: 16 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27793,10 +70907,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27804,15 +70918,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27824,6 +70943,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27833,6 +70953,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27842,6 +70963,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -27856,35 +70978,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 182 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 451 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -27894,41 +71024,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 LSPB: 16 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -27942,10 +71072,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -27953,15 +71083,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -27973,6 +71106,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -27982,6 +71116,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -27991,6 +71126,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28005,35 +71141,45 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 183 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 452 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -28043,41 +71189,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 4 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28091,10 +71237,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28102,15 +71248,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28122,6 +71273,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28131,6 +71283,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28140,6 +71293,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28154,35 +71308,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 184 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 453 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id007 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -28192,41 +71354,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28240,10 +71402,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28251,15 +71413,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28271,6 +71436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28280,6 +71446,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28289,6 +71456,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28303,79 +71471,89 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 185 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 454 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 384 - LdsNumElementsAlignedB: 384 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 384 - LdsOffsetB_Blk: 1408 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -28388,11 +71566,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28400,15 +71578,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28420,6 +71603,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28429,6 +71613,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28438,6 +71623,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28452,39 +71638,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 186 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 455 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -28492,56 +71686,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28549,15 +71743,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28569,6 +71768,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28578,6 +71778,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28587,6 +71788,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28601,96 +71803,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 187 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 456 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28698,15 +71908,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28718,6 +71931,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28727,6 +71941,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28736,6 +71951,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28750,39 +71966,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 188 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 457 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -28790,56 +72016,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 16 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28847,15 +72073,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -28867,6 +72098,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -28876,6 +72108,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -28885,6 +72118,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -28899,96 +72133,104 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 189 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 458 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 16 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -28996,15 +72238,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -29016,6 +72261,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29025,6 +72271,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29034,6 +72281,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29048,33 +72296,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 190 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 459 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -29088,56 +72346,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29145,15 +72403,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -29165,6 +72428,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29174,6 +72438,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29183,6 +72448,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29197,33 +72463,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 191 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 460 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -29236,57 +72510,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29294,15 +72568,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -29314,6 +72591,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29323,6 +72601,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29332,6 +72611,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29346,33 +72626,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 192 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 461 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -29385,7 +72675,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -29395,47 +72685,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29443,26 +72734,31 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29472,6 +72768,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29481,6 +72778,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29495,14 +72793,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 193 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 462 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -29513,21 +72818,24 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29535,45 +72843,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 128 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -29582,9 +72891,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29592,19 +72901,25 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29612,6 +72927,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29621,6 +72937,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29630,6 +72947,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29644,39 +72962,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 194 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 463 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -29692,48 +73018,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29741,26 +73068,33 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29770,6 +73104,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29779,6 +73114,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29793,48 +73129,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 195 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 464 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -29842,47 +73186,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -29890,19 +73235,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -29910,6 +73259,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -29919,6 +73269,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -29928,6 +73279,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -29942,47 +73294,57 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 196 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 465 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -29990,37 +73352,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30028,10 +73387,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30039,26 +73398,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30068,6 +73435,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30077,6 +73445,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30091,47 +73460,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 197 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 466 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id012 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -30139,37 +73516,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30178,9 +73552,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30188,26 +73562,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30217,6 +73599,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30226,6 +73609,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30240,39 +73624,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 198 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 467 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id012 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30280,7 +73672,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -30288,48 +73680,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30337,19 +73730,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30357,6 +73757,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30366,6 +73767,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30375,6 +73777,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30389,33 +73792,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 199 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 468 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -30429,56 +73840,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30486,19 +73898,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30506,6 +73925,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30515,6 +73935,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30524,6 +73945,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30538,33 +73960,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 200 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 469 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -30578,56 +74008,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30635,19 +74066,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -30655,6 +74093,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30664,6 +74103,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30673,6 +74113,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30687,39 +74128,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 201 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 470 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -30727,45 +74176,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -30774,9 +74224,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30784,26 +74234,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30813,6 +74271,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30822,6 +74281,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30836,45 +74296,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 202 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 471 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -30884,31 +74352,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -30921,11 +74386,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -30935,24 +74400,32 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -30962,6 +74435,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -30971,6 +74445,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -30985,14 +74460,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 203 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 472 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -31003,15 +74485,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -31024,9 +74507,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31034,26 +74517,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 8 LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1664 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -31072,9 +74556,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31082,26 +74566,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31111,6 +74601,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31120,6 +74611,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31134,17 +74626,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 204 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 473 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -31152,30 +74651,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31183,36 +74685,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31220,10 +74723,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31233,17 +74736,22 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31251,6 +74759,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31260,6 +74769,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31269,6 +74779,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31283,17 +74794,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 205 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 474 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -31301,30 +74819,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id012 - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31332,47 +74853,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31380,19 +74902,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31400,6 +74927,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31409,6 +74937,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31418,6 +74947,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31432,85 +74962,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 206 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 475 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31518,10 +75059,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31529,19 +75070,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31549,6 +75095,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31558,6 +75105,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31567,6 +75115,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31581,48 +75130,58 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 207 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 476 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31630,47 +75189,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31678,19 +75238,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31698,6 +75263,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31707,6 +75273,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31716,6 +75283,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31730,39 +75298,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 208 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 477 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31778,48 +75356,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31827,26 +75406,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -31856,6 +75443,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -31865,6 +75453,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -31879,39 +75468,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 209 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 478 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -31920,7 +75517,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -31928,36 +75525,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -31965,10 +75563,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -31976,19 +75574,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -31996,6 +75601,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32005,6 +75611,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32014,6 +75621,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32028,46 +75636,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 210 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 479 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -32077,22 +75693,23 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -32106,18 +75723,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32125,19 +75742,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32145,6 +75767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32154,6 +75777,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32163,6 +75787,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32177,39 +75802,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 211 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 480 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32225,23 +75860,24 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -32255,7 +75891,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -32263,10 +75899,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32274,8 +75910,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -32283,10 +75919,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32294,6 +75937,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32303,6 +75947,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32312,6 +75957,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32326,39 +75972,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 212 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 481 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32366,35 +76020,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -32404,18 +76059,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -32423,19 +76078,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32443,6 +76105,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32452,6 +76115,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32461,6 +76125,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32475,39 +76140,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 213 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 482 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -32515,55 +76188,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -32572,8 +76246,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -32581,10 +76255,17 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32592,6 +76273,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32601,6 +76283,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32610,6 +76293,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32624,14 +76308,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 214 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 483 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -32641,16 +76332,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32664,39 +76356,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -32710,9 +76403,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -32721,19 +76414,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32741,6 +76441,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32750,6 +76451,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32759,6 +76461,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32773,33 +76476,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 215 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 484 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32813,55 +76524,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -32870,19 +76582,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -32890,6 +76609,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -32899,6 +76619,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -32908,6 +76629,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -32922,33 +76644,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 216 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 485 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id012 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -32962,39 +76692,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -33008,9 +76739,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -33019,19 +76750,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33039,6 +76777,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33048,6 +76787,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33057,6 +76797,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33071,33 +76812,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 217 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 486 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -33111,35 +76860,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -33149,18 +76899,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33168,19 +76918,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33188,6 +76945,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33197,6 +76955,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33206,6 +76965,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33220,39 +76980,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 218 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 487 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 1 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33261,7 +77029,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -33269,26 +77037,27 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -33298,18 +77067,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33317,26 +77086,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33346,6 +77123,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33355,6 +77133,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33369,14 +77148,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 219 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 488 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -33387,21 +77173,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33409,45 +77196,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -33455,10 +77243,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33466,19 +77254,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33486,6 +77281,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33495,6 +77291,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33504,6 +77301,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33518,39 +77316,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 220 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 489 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33567,22 +77373,23 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -33596,7 +77403,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -33604,10 +77411,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -33615,8 +77422,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -33624,10 +77431,17 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -33635,6 +77449,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33644,6 +77459,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33653,6 +77469,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33667,75 +77484,84 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 221 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 490 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -33745,7 +77571,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -33753,37 +77579,43 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33793,6 +77625,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33802,6 +77635,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -33816,39 +77650,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 222 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 491 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -33865,74 +77709,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -33942,6 +77797,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -33951,8 +77807,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -33965,17 +77823,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 223 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 492 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -33983,21 +77848,22 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34005,76 +77871,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 32 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34082,6 +77958,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34091,6 +77968,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34100,8 +77978,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34114,116 +77994,132 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 224 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 493 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34231,6 +78127,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34240,6 +78137,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34249,8 +78147,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34263,75 +78163,86 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 225 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 494 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -34341,38 +78252,45 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34380,6 +78298,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34389,6 +78308,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34398,8 +78318,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34412,85 +78334,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 226 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 495 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34498,37 +78431,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34538,6 +78479,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34547,8 +78489,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34561,85 +78505,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 227 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 496 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34647,30 +78602,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34678,6 +78640,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34687,6 +78650,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34696,8 +78660,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34710,48 +78676,58 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 228 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id013 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 497 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -34759,36 +78735,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34796,30 +78773,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34827,6 +78811,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34836,6 +78821,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34845,8 +78831,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -34859,39 +78847,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 229 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 498 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -34908,36 +78906,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -34945,30 +78944,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -34976,6 +78984,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -34985,6 +78994,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -34994,8 +79004,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35008,35 +79020,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 230 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 499 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id011 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -35046,41 +79066,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35094,30 +79115,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35125,6 +79153,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35134,6 +79163,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35143,8 +79173,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35157,79 +79189,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 231 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 500 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35242,31 +79285,38 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35274,6 +79324,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35283,6 +79334,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35292,8 +79344,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35306,79 +79360,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 232 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 501 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 128 - LSPA: 16 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35391,31 +79456,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35423,6 +79497,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35432,6 +79507,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35441,8 +79517,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35455,79 +79533,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 233 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 502 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 128 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35540,7 +79627,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -35548,12 +79635,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -35561,10 +79650,17 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35572,6 +79668,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35581,6 +79678,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35590,8 +79688,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35604,75 +79704,84 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 234 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 503 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -35689,31 +79798,38 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35721,6 +79837,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35730,6 +79847,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35739,8 +79857,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35753,79 +79873,90 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 235 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 504 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id017 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35838,31 +79969,40 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -35870,6 +80010,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -35879,6 +80020,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -35888,8 +80030,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -35902,79 +80046,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 236 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 505 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id016 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -35987,38 +80140,48 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36028,6 +80191,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36037,8 +80201,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36051,39 +80217,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 237 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 506 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id016 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36107,60 +80281,70 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36168,6 +80352,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36177,6 +80362,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36186,8 +80372,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36200,39 +80388,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 238 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 507 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36256,60 +80452,70 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36317,6 +80523,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36326,6 +80533,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36335,8 +80543,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36349,46 +80559,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 239 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 508 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -36405,29 +80623,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -36435,30 +80654,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36466,6 +80692,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36475,6 +80702,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36484,8 +80712,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36498,14 +80728,21 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 240 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 509 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -36516,28 +80753,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -36554,15 +80794,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 @@ -36576,7 +80817,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -36584,30 +80825,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36615,6 +80863,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36624,6 +80873,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36633,8 +80883,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36647,39 +80899,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 241 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 510 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -36687,45 +80949,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 4 + LSPB: 32 LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -36733,30 +80996,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36764,6 +81036,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36773,6 +81046,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36782,8 +81056,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36796,46 +81072,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 242 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 511 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -36852,15 +81136,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 @@ -36874,7 +81159,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -36882,30 +81167,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -36913,6 +81205,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -36922,6 +81215,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -36931,8 +81225,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -36945,17 +81241,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 243 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 512 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -36963,15 +81266,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -36985,56 +81291,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 4 - LSPB: 16 + LSPB: 8 LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37042,19 +81349,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37062,6 +81376,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37071,6 +81386,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37080,6 +81396,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37094,39 +81411,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 244 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 513 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -37134,56 +81459,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37191,19 +81517,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37211,6 +81544,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37220,6 +81554,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37229,6 +81564,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37243,33 +81579,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 245 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 514 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -37283,39 +81627,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37329,10 +81674,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37340,19 +81685,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37360,6 +81712,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37369,6 +81722,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37378,6 +81732,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37392,39 +81747,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 246 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 515 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -37432,39 +81795,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -37477,11 +81841,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37489,19 +81853,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37509,6 +81880,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37518,6 +81890,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37527,6 +81900,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37541,33 +81915,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 247 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 516 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id023 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -37581,56 +81963,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37638,19 +82021,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37658,6 +82048,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37667,6 +82058,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37676,6 +82068,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37690,96 +82083,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 248 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id024 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 517 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37787,19 +82189,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37807,6 +82214,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37816,6 +82224,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37825,6 +82234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37839,33 +82249,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 249 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 518 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -37878,7 +82298,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -37895,40 +82315,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -37936,19 +82357,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -37956,6 +82382,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -37965,6 +82392,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -37974,6 +82402,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -37988,33 +82417,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 250 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 519 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id026 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -38028,56 +82467,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38085,19 +82525,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38105,6 +82552,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38114,6 +82562,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38123,6 +82572,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38137,79 +82587,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 251 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 520 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id020 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38222,11 +82681,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38234,19 +82693,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38254,6 +82720,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38263,6 +82730,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38272,6 +82740,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38286,46 +82755,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 252 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 521 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id027 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -38334,31 +82811,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38371,11 +82849,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38384,18 +82862,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38403,6 +82888,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38412,6 +82898,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38421,6 +82908,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38435,39 +82923,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 253 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 522 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id027 + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -38483,48 +82979,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38532,19 +83029,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38552,6 +83056,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38561,6 +83066,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38570,6 +83076,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38584,33 +83091,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 254 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 523 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id029 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -38623,9 +83138,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -38636,44 +83151,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 64 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38681,19 +83197,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38701,6 +83222,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38710,6 +83232,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38719,6 +83242,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38733,39 +83257,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 255 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 524 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id031 - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -38773,8 +83307,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -38782,30 +83316,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38818,11 +83353,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38830,19 +83365,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38850,6 +83392,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -38859,6 +83402,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -38868,6 +83412,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -38882,79 +83427,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 256 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 525 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -38967,11 +83521,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -38979,19 +83533,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -38999,6 +83558,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39008,6 +83568,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39017,6 +83578,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39031,39 +83593,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 257 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 526 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id030 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -39079,31 +83651,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 32 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -39116,11 +83689,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39128,19 +83701,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39148,6 +83728,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39157,6 +83738,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39166,6 +83748,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39180,39 +83763,47 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 258 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 527 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -39228,31 +83819,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 4 LSPB: 4 - LVCA: 16 - LVCB: 16 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -39265,11 +83857,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39277,19 +83869,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39297,6 +83896,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39306,6 +83906,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39315,6 +83916,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39329,33 +83931,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 259 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 528 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id030 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39368,57 +83978,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39426,19 +84037,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39446,6 +84062,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39455,6 +84072,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39464,6 +84082,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39478,39 +84097,49 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 260 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 529 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -39518,56 +84147,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39575,7 +84205,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -39583,11 +84213,18 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39595,6 +84232,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39604,6 +84242,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39613,6 +84252,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39627,33 +84267,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 261 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 530 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39667,8 +84315,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -39676,47 +84324,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39724,19 +84373,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39744,6 +84400,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39753,6 +84410,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39762,6 +84420,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39776,33 +84435,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 262 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 531 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id029 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -39815,57 +84482,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -39873,19 +84541,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -39893,6 +84566,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -39902,6 +84576,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -39911,6 +84586,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -39925,44 +84601,56 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 263 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id028 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 532 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id031 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -39979,23 +84667,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 4 - LSPB: 2 + LSPB: 4 LVCA: 64 - LVCB: 128 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40008,11 +84697,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40020,17 +84709,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -40038,6 +84736,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40047,6 +84746,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40056,6 +84756,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40070,95 +84771,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 264 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 533 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id035 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40166,17 +84877,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -40184,6 +84904,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40193,6 +84914,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40202,6 +84924,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40216,91 +84939,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 265 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 534 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 1 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 1 + LSPB: 16 LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40308,24 +85045,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40335,6 +85082,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40344,6 +85092,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40358,74 +85107,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 266 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 535 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40435,14 +85198,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40450,24 +85213,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40477,6 +85250,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40486,6 +85260,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40500,45 +85275,54 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 267 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 536 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -40547,44 +85331,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 256 + LSCB: 32 LSPA: 8 - LSPB: 1 + LSPB: 8 LVCA: 32 - LVCB: 256 + LVCB: 32 LVPA: 8 - LVPB: 1 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 512 LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 256 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40592,24 +85381,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40619,6 +85418,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40628,6 +85428,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40642,33 +85443,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 268 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 537 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id032 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -40680,7 +85489,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -40697,36 +85507,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 + LSCB: 32 + LSPA: 4 + LSPB: 8 LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -40734,24 +85549,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40761,6 +85586,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40770,6 +85596,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40784,78 +85611,88 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 269 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 538 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - ThreadTile: *id032 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -40868,7 +85705,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -40881,16 +85718,25 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -40898,6 +85744,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -40907,6 +85754,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -40916,6 +85764,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -40930,44 +85779,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 270 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 539 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id036 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -40985,39 +85843,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 2 - LVPB: 2 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -41026,17 +85885,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41044,6 +85910,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41053,6 +85920,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41062,6 +85930,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41076,33 +85945,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 271 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 540 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -41114,40 +85993,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -41161,10 +86042,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41174,15 +86055,24 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41190,6 +86080,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41199,6 +86090,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41208,6 +86100,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41222,44 +86115,53 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 272 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 541 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -41277,25 +86179,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 + LVCB: 8 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -41304,9 +86211,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41314,24 +86221,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41341,6 +86256,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41350,6 +86266,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41364,16 +86281,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 273 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: [8, 4] - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 542 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -41381,78 +86306,82 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id033 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41460,17 +86389,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41478,6 +86416,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41487,6 +86426,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41496,6 +86436,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41510,91 +86451,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 274 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 543 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 1 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41602,24 +86557,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41629,6 +86594,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41638,6 +86604,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41652,95 +86619,105 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 275 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 544 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 1 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41748,17 +86725,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -41766,6 +86752,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41775,6 +86762,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41784,6 +86772,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41798,33 +86787,41 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 276 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id036 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 545 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 1 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -41836,53 +86833,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -41890,24 +86893,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -41917,6 +86930,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -41926,6 +86940,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -41940,32 +86955,39 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 277 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: *id032 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 546 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id035 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -41973,43 +86995,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 16 - LVCB: 8 - LVPA: 1 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42022,11 +87049,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -42034,13 +87061,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42050,13 +87080,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42076,8 +87108,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42095,37 +87127,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 278 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 547 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42140,32 +87170,33 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 LSPA: 4 LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -42198,13 +87229,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42214,6 +87248,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -42221,6 +87256,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42240,8 +87276,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42259,37 +87295,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 279 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + SolutionIndex: 548 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42297,49 +87331,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 4 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -42347,9 +87386,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -42358,13 +87397,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42374,13 +87414,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42400,8 +87442,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42419,8 +87461,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 280 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 549 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -42429,27 +87471,27 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42457,7 +87499,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -42465,45 +87507,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -42511,10 +87554,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -42522,13 +87565,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42538,6 +87582,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -42545,6 +87590,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42564,8 +87610,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42583,8 +87629,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 281 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + SolutionIndex: 550 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -42592,28 +87638,28 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42624,57 +87670,62 @@ DepthU: 16 DirectToLds: false DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 LVCA: 32 - LVCB: 8 - LVPA: 1 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -42682,13 +87733,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42698,13 +87752,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42724,8 +87780,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42743,37 +87799,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 282 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42788,56 +87842,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -42846,13 +87901,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -42862,6 +87920,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -42869,6 +87928,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -42888,8 +87948,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42907,16 +87967,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 283 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -42927,17 +87987,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -42951,37 +88009,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 1 + LVCB: 32 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -42995,10 +88058,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43006,13 +88069,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43022,13 +88088,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43048,8 +88116,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43067,37 +88135,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 284 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 553 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43105,43 +88171,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -43154,11 +88225,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43166,13 +88237,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43182,13 +88256,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43208,8 +88284,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43227,8 +88303,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 285 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 554 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -43237,27 +88313,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43265,60 +88339,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43326,13 +88405,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43342,13 +88424,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43368,8 +88452,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43387,15 +88471,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 286 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 555 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -43403,21 +88487,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43425,39 +88507,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -43471,18 +88554,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43490,13 +88573,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43506,6 +88592,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -43513,6 +88600,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43532,8 +88620,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43551,37 +88639,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 287 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 556 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43589,43 +88675,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -43638,11 +88729,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43650,13 +88741,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43666,13 +88760,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43692,8 +88788,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43711,37 +88807,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 288 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 557 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -43749,43 +88843,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -43798,11 +88897,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43810,13 +88909,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43826,13 +88928,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -43852,8 +88956,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43871,8 +88975,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 289 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 558 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -43881,23 +88985,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -43909,43 +89011,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -43958,11 +89065,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -43970,13 +89077,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -43986,13 +89096,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -44012,8 +89124,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44031,8 +89143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 290 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 559 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -44041,27 +89153,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44069,14 +89179,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -44093,15 +89203,16 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -44115,18 +89226,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44134,13 +89245,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44150,6 +89264,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -44157,6 +89272,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -44176,8 +89292,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44195,16 +89311,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 291 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 560 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -44216,12 +89332,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -44233,7 +89347,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -44257,29 +89371,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -44287,10 +89402,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44298,15 +89413,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44316,8 +89432,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44343,8 +89460,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44362,20 +89479,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 292 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -44383,14 +89500,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44404,7 +89521,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -44418,23 +89535,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 4 - LSPB: 4 + LSPB: 8 LVCA: 64 - LVCB: 64 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -44449,9 +89571,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44459,15 +89581,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44477,7 +89600,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -44504,8 +89628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44523,8 +89647,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 293 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -44533,17 +89657,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -44551,7 +89675,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44559,13 +89683,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -44579,40 +89703,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44620,15 +89749,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44638,7 +89768,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -44665,8 +89796,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44684,15 +89815,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 294 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 563 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -44704,15 +89835,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44727,40 +89858,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -44775,9 +89907,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -44785,15 +89917,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44803,8 +89934,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44830,8 +89962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44849,8 +89981,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 295 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -44858,26 +89990,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -44885,49 +90019,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -44946,15 +90085,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -44964,8 +90102,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -44991,8 +90130,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45010,8 +90149,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 296 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45026,15 +90165,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -45046,13 +90187,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -45070,25 +90211,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -45096,10 +90242,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45107,15 +90253,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45125,7 +90272,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -45152,8 +90300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45171,35 +90319,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 297 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 566 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45213,7 +90361,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -45227,23 +90375,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 128 LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45257,10 +90410,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45268,15 +90421,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45286,8 +90440,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45313,8 +90468,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45332,28 +90487,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 298 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -45368,13 +90523,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -45392,25 +90547,30 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -45418,10 +90578,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45429,15 +90589,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45447,7 +90608,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -45474,8 +90636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45493,35 +90655,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 299 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45537,7 +90699,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -45545,27 +90707,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1792 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -45584,9 +90747,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45594,15 +90757,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45612,8 +90776,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45639,8 +90804,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45658,8 +90823,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 300 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45667,26 +90832,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45714,27 +90879,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 4 - LSPB: 2 + LSPB: 4 LVCA: 64 - LVCB: 128 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45749,9 +90915,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45759,15 +90925,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45777,8 +90944,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45804,8 +90972,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45823,8 +90991,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 301 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -45832,26 +91000,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -45865,37 +91033,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -45910,9 +91083,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -45921,14 +91094,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -45938,8 +91110,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -45965,8 +91138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45984,15 +91157,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 302 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -46000,19 +91173,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46026,37 +91201,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46071,9 +91251,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46082,14 +91262,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46099,7 +91280,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -46126,8 +91308,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46145,15 +91327,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 303 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -46161,19 +91343,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46181,60 +91363,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 64 + LSPA: 32 LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46242,15 +91429,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46260,7 +91448,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -46287,8 +91476,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46306,35 +91495,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 304 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46342,60 +91531,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46403,15 +91597,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46421,8 +91616,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -46448,8 +91644,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46467,35 +91663,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 305 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46503,43 +91699,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 128 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 64 - LVCB: 16 + LVCA: 32 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -46552,11 +91753,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46564,15 +91765,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46582,8 +91784,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -46609,8 +91812,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46628,35 +91831,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 306 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46670,54 +91873,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46726,14 +91934,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46743,7 +91952,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -46770,8 +91980,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46789,35 +91999,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 307 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46825,13 +92035,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -46845,40 +92055,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -46886,15 +92101,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -46904,7 +92120,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -46931,8 +92148,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46950,15 +92167,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 308 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -46970,15 +92187,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -46986,56 +92203,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -47047,15 +92269,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47065,8 +92286,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -47092,8 +92314,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47111,8 +92333,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 309 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -47127,19 +92349,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47147,56 +92371,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -47208,7 +92437,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 2 @@ -47216,7 +92445,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47226,7 +92456,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -47253,8 +92484,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47272,8 +92503,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 310 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -47288,19 +92519,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47308,49 +92539,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -47358,9 +92594,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -47369,15 +92605,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47387,8 +92622,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -47414,8 +92650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47433,14 +92669,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 311 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -47449,19 +92685,21 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47469,60 +92707,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -47530,15 +92773,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47548,7 +92790,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -47575,8 +92818,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47594,35 +92837,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 312 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47630,7 +92875,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -47654,40 +92899,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -47695,15 +92941,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47713,6 +92960,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -47740,8 +92988,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47759,20 +93007,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 313 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -47780,14 +93028,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47795,7 +93043,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -47803,31 +93051,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -47841,18 +93090,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -47860,15 +93109,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -47878,6 +93128,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -47905,8 +93156,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47924,16 +93175,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 314 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -47944,15 +93195,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -47960,39 +93211,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -48006,18 +93258,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48025,13 +93277,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48041,6 +93296,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48068,8 +93324,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48087,33 +93343,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 315 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -48125,7 +93379,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -48133,39 +93387,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48178,11 +93433,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48190,15 +93445,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48208,6 +93464,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48235,8 +93492,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48254,8 +93511,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 316 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48263,26 +93520,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48298,39 +93555,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48344,9 +93602,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -48355,15 +93613,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48373,6 +93632,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48400,8 +93660,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48419,8 +93679,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 317 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48428,18 +93688,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -48447,7 +93707,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48462,57 +93722,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 + LSCA: 64 + LSCB: 16 LSPA: 8 LSPB: 16 LVCA: 32 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48520,13 +93781,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48536,6 +93800,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48563,8 +93828,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48582,37 +93847,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 318 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48628,56 +93891,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48685,15 +93949,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48703,6 +93968,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48730,8 +93996,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48749,35 +94015,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 319 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48785,47 +94051,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -48838,11 +94105,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -48850,13 +94117,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -48866,6 +94136,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -48893,8 +94164,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48912,8 +94183,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 320 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -48921,28 +94192,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -48950,7 +94219,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -48958,31 +94227,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -48996,18 +94266,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49015,15 +94285,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49033,6 +94304,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49060,8 +94332,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49079,16 +94351,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 321 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -49099,15 +94371,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49122,57 +94394,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49180,15 +94453,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49198,6 +94470,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49225,8 +94498,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49244,35 +94517,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 322 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49287,7 +94562,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -49304,23 +94579,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -49335,9 +94611,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49345,13 +94621,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49361,6 +94640,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49388,8 +94668,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49407,8 +94687,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 323 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -49416,12 +94696,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -49429,15 +94709,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49445,14 +94723,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -49469,40 +94747,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49510,15 +94789,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49528,6 +94806,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49555,8 +94834,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49574,20 +94853,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 324 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -49595,14 +94874,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49610,7 +94891,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -49618,45 +94899,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -49664,9 +94946,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -49675,13 +94957,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49691,6 +94974,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49718,8 +95002,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49737,37 +95021,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 325 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49775,7 +95059,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -49799,29 +95083,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -49829,10 +95114,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -49840,15 +95125,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -49858,6 +95144,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -49885,8 +95172,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49904,35 +95191,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 326 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -49940,64 +95227,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCA: 32 + LSCB: 64 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50005,13 +95293,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50021,6 +95312,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -50048,8 +95340,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50067,33 +95359,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 327 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -50105,65 +95395,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50171,13 +95461,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50189,7 +95482,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -50215,8 +95508,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50234,33 +95527,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 328 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -50272,7 +95563,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -50299,38 +95590,38 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 32 - LVCB: 64 + LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50338,15 +95629,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50384,175 +95676,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 329 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 2 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 NumIndicesSummation: 1 - NumIndicesLD: 4 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50570,14 +95695,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 330 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -50586,15 +95711,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -50606,14 +95731,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -50632,28 +95757,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -50661,10 +95786,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -50672,13 +95797,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -50716,8 +95844,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50735,15 +95863,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 331 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -50756,12 +95884,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [4, 1028.02] @@ -50897,8 +96023,6 @@ - [83, 6071.16] - - [6784, 6784, 1, 1280] - [80, 9535.64] - - - [1024, 256, 1, 3328] - - [74, 5742.58] - - [1408, 4288, 1, 1280] - [83, 8254.99] - - [3584, 4288, 1, 1280] @@ -52377,8 +97501,6 @@ - [56, 5129.81] - - [2368, 3584, 1, 256] - [74, 8998.7] - - - [1024, 256, 1, 1280] - - [81, 3566.58] - - [5056, 3584, 1, 1280] - [75, 9345.07] - - [448, 4, 1, 3328] @@ -54871,4112 +99993,5872 @@ - [162, 5765.37] - - [4096, 3072, 1, 128] - [164, 8869.01] + - - [768, 3072, 1, 4096] + - [176, 10028.7] + - - [64, 256, 192, 256] + - [170, 8791.55] + - - [768, 2, 1, 16] + - [173, 4.95484] + - - [768, 768, 1, 64] + - [169, 3469.55] + - - [768, 768, 1, 4096] + - [177, 7475.0] + - - [768, 30522, 1, 1280] + - [180, 10296.9] + - - [64, 128, 384, 128] + - [170, 7660.83] + - - [768, 30522, 1, 320] + - [178, 10007.9] + - - [768, 768, 1, 32] + - [167, 2359.3] + - - [3072, 768, 1, 4096] + - [176, 10033.7] + - - [768, 30522, 1, 640] + - [179, 10206.7] + - - [64, 64, 768, 64] + - [168, 5494.72] + - - [768, 768, 1, 640] + - [177, 6721.64] + - - [768, 768, 1, 16] + - [166, 1203.72] + - - [768, 768, 1, 1280] + - [175, 7138.57] + - - [768, 2, 1, 32] + - [171, 11.8154] + - - [2048, 2048, 1, 512] + - [191, 9607.57] + - - [512, 32, 1, 200] + - [184, 422.268] + - - [1024, 1, 1, 200] + - [187, 24.6154] + - - [1600, 1024, 1, 512] + - [182, 8115.91] + - - [560, 1024, 1, 200] + - [181, 4810.74] + - - [1024, 1024, 1, 512] + - [190, 8614.74] + - - [2048, 1, 1, 512] + - [185, 80.9086] + - - [512, 512, 1, 200] + - [183, 4398.39] + - - [100, 2048, 1, 512] + - [188, 4443.12] + - - [1024, 1024, 1, 200] + - [189, 6990.51] + - - [1024, 64, 1, 512] + - [186, 2853.27] + - - [1024, 256, 1, 18944] + - [210, 9196.41] + - - [256, 3328, 1, 8976] + - [200, 8299.26] + - - [1024, 256, 1, 4352] + - [208, 8813.74] + - - [256, 9728, 1, 8976] + - [203, 9638.48] + - - [1024, 256, 1, 3072] + - [210, 8640.63] + - - [768, 2048, 1, 256] + - [202, 8662.93] + - - [1024, 256, 1, 19968] + - [207, 9220.86] + - - [256, 12800, 1, 8976] + - [197, 9418.42] + - - [1024, 256, 1, 3328] + - [211, 8682.48] + - - [256, 10240, 1, 8976] + - [204, 10137.7] + - - [1024, 256, 1, 15104] + - [209, 9167.03] + - - [256, 10496, 1, 8976] + - [197, 9858.38] + - - [1024, 256, 1, 2816] + - [212, 8575.71] + - - [1024, 256, 1, 4608] + - [207, 8861.21] + - - [256, 11264, 1, 8976] + - [194, 9627.69] + - - [1024, 256, 1, 6400] + - [207, 8985.23] + - - [1024, 256, 1, 16128] + - [207, 9170.26] + - - [256, 44505, 1, 8976] + - [201, 10331.8] + - - [256, 6144, 1, 8976] + - [204, 10395.0] + - - [1024, 256, 1, 5120] + - [209, 8881.53] + - - [1024, 256, 1, 7936] + - [212, 9023.14] + - - [256, 3840, 1, 8976] + - [199, 9541.28] + - - [1024, 256, 1, 21248] + - [207, 9209.72] + - - [1024, 256, 1, 12032] + - [209, 9156.17] + - - [256, 8192, 1, 8976] + - [206, 10374.4] + - - [1024, 256, 1, 3584] + - [208, 8712.2] + - - [1024, 256, 1, 14336] + - [209, 9162.51] + - - [256, 7168, 1, 8976] + - [195, 9554.86] + - - [1024, 256, 1, 13568] + - [207, 9165.04] + - - [256, 4096, 1, 8976] + - [199, 10146.6] + - - [1024, 256, 1, 4096] + - [208, 8783.88] + - - [256, 2560, 1, 8976] + - [198, 8381.56] + - - [256, 20992, 1, 8976] + - [197, 9989.86] + - - [256, 4352, 1, 8976] + - [198, 9634.92] + - - [256, 33536, 1, 8976] + - [197, 10218.1] + - - [256, 3584, 1, 8976] + - [199, 8924.5] + - - [256, 26112, 1, 8976] + - [198, 10272.3] + - - [256, 14336, 1, 8976] + - [202, 10217.3] + - - [1024, 256, 1, 14848] + - [209, 9185.19] + - - [1024, 256, 1, 8448] + - [210, 9025.89] + - - [1024, 256, 1, 28672] + - [207, 9256.4] + - - [1024, 256, 1, 5632] + - [207, 8932.69] + - - [256, 22016, 1, 8976] + - [202, 10151.9] + - - [1024, 256, 1, 33536] + - [207, 9243.07] + - - [256, 5120, 1, 8976] + - [193, 9418.05] + - - [256, 11520, 1, 8976] + - [200, 9701.0] + - - [256, 19968, 1, 8976] + - [198, 10228.0] + - - [1024, 256, 1, 5376] + - [209, 8892.52] + - - [1024, 256, 1, 22016] + - [207, 9244.24] + - - [256, 8960, 1, 8976] + - [198, 9841.31] + - - [1024, 256, 1, 15872] + - [207, 9223.15] + - - [256, 17408, 1, 8976] + - [202, 9785.77] + - - [256, 5632, 1, 8976] + - [202, 9564.22] + - - [256, 32512, 1, 8976] + - [201, 10357.9] + - - [256, 11008, 1, 8976] + - [194, 9445.13] + - - [1024, 256, 1, 6144] + - [209, 8955.81] + - - [256, 4864, 1, 8976] + - [194, 8979.35] + - - [256, 15104, 1, 8976] + - [197, 10007.0] + - - [1024, 256, 1, 9984] + - [207, 9110.43] + - - [256, 1280, 1, 8976] + - [193, 5944.34] + - - [1024, 256, 1, 1024] + - [209, 7005.1] + - - [1024, 256, 1, 9728] + - [209, 9066.19] + - - [1024, 256, 1, 10496] + - [207, 9118.05] + - - [256, 11776, 1, 8976] + - [204, 9911.64] + - - [256, 12544, 1, 8976] + - [197, 9235.25] + - - [1024, 256, 1, 17152] + - [207, 9152.21] + - - [1024, 256, 1, 11520] + - [209, 9146.77] + - - [1024, 256, 1, 21504] + - [209, 9207.42] + - - [256, 17152, 1, 8976] + - [196, 9654.71] + - - [1024, 256, 1, 17408] + - [207, 9181.17] + - - [256, 15872, 1, 8976] + - [205, 10086.4] + - - [256, 18688, 1, 8976] + - [198, 9612.47] + - - [256, 5888, 1, 8976] + - [202, 9988.33] + - - [512, 2048, 1, 256] + - [192, 7678.36] + - - [1024, 256, 1, 7680] + - [210, 9032.96] + - - [1024, 256, 1, 1280] + - [212, 7767.23] + - - [256, 14848, 1, 8976] + - [198, 9852.66] + - - [256, 9984, 1, 8976] + - [204, 9908.87] + - - [256, 20480, 1, 8976] + - [202, 10337.1] + - - [1024, 256, 1, 8192] + - [209, 9044.32] + - - [1024, 256, 1, 19712] + - [208, 9184.18] + - - [256, 13568, 1, 8976] + - [198, 9927.82] + - - [256, 13312, 1, 8976] + - [197, 9757.91] + - - [256, 2816, 1, 8976] + - [197, 9191.43] + - - [1024, 256, 1, 2304] + - [208, 8444.91] + - - [256, 21248, 1, 8976] + - [198, 10127.5] + - - [256, 16128, 1, 8976] + - [206, 10238.4] + - - [256, 512, 36, 98] + - [229, 7994.85] + - - [64, 192, 36, 25088] + - [298, 8613.89] + - - [128, 128, 64, 25] + - [228, 2540.15] + - - [256, 256, 64, 56] + - [229, 6924.56] + - - [512, 486, 36, 800] + - [236, 8994.84] + - - [512, 512, 36, 1568] + - [247, 9872.38] + - - [64, 192, 64, 3200] + - [292, 9295.89] + - - [256, 384, 36, 4096] + - [292, 9334.61] + - - [128, 256, 64, 32] + - [231, 4279.9] + - - [64, 128, 64, 23104] + - [298, 10103.1] + - - [128, 256, 64, 9] + - [222, 1709.63] + - - [256, 512, 36, 784] + - [232, 9520.73] + - - [256, 324, 36, 32] + - [270, 4473.38] + - - [512, 512, 36, 33] + - [241, 5925.17] + - - [16, 32, 36, 5760] + - [245, 1448.8] + - - [192, 384, 64, 128] + - [292, 8618.43] + - - [512, 512, 64, 72] + - [248, 8260.12] + - - [128, 128, 64, 1600] + - [221, 9008.38] + - - [512, 512, 36, 128] + - [292, 8871.62] + - - [192, 384, 64, 2304] + - [221, 9657.16] + - - [384, 256, 64, 450] + - [257, 9538.93] + - - [3, 64, 36, 6272] + - [245, 509.784] + - - [3, 64, 64, 2888] + - [274, 708.621] + - - [384, 256, 64, 2304] + - [257, 10287.5] + - - [512, 512, 64, 144] + - [292, 9226.7] + - - [256, 256, 36, 6272] + - [232, 9607.28] + - - [80, 192, 64, 4608] + - [293, 7347.93] + - - [64, 64, 36, 3136] + - [280, 5959.05] + - - [256, 384, 64, 2304] + - [257, 10283.4] + - - [512, 512, 36, 66] + - [241, 7618.08] + - - [128, 256, 64, 800] + - [267, 9611.15] + - - [64, 128, 36, 30] + - [223, 1242.61] + - - [192, 256, 36, 512] + - [292, 8657.97] + - - [256, 512, 64, 200] + - [292, 9153.87] + - - [256, 512, 64, 25] + - [270, 5349.88] + - - [3, 64, 64, 46208] + - [273, 808.562] + - - [128, 256, 36, 1568] + - [265, 8528.62] + - - [64, 128, 64, 11552] + - [298, 9997.0] + - - [128, 192, 64, 946] + - [292, 9198.38] + - - [64, 192, 64, 12800] + - [253, 9000.66] + - - [224, 224, 64, 128] + - [230, 6312.07] + - - [128, 256, 64, 288] + - [292, 8697.87] + - - [64, 64, 64, 826] + - [235, 6650.21] + - - [256, 384, 64, 1152] + - [267, 10106.8] + - - [3, 64, 64, 92416] + - [273, 812.031] + - - [32, 32, 36, 43808] + - [214, 2813.09] + - - [160, 320, 64, 288] + - [224, 8090.86] + - - [1, 16, 36, 23040] + - [261, 42.6667] + - - [128, 256, 36, 128] + - [239, 6049.48] + - - [128, 128, 64, 3360] + - [292, 9199.96] + - - [128, 128, 64, 420] + - [292, 8131.5] + - - [64, 128, 64, 361] + - [229, 6937.98] + - - [512, 512, 36, 16] + - [285, 3797.66] + - - [384, 256, 36, 800] + - [226, 9151.65] + - - [192, 384, 36, 4096] + - [226, 8867.57] + - - [64, 64, 64, 1600] + - [278, 7931.74] + - - [256, 384, 64, 576] + - [258, 9745.8] + - - [512, 512, 64, 14] + - [241, 3638.18] + - - [512, 512, 36, 8] + - [216, 2279.51] + - - [512, 486, 64, 128] + - [232, 8337.83] + - - [1, 16, 64, 640] + - [266, 49.9512] + - - [64, 96, 64, 288] + - [291, 5707.97] + - - [96, 96, 36, 1568] + - [260, 6866.75] + - - [256, 256, 36, 128] + - [264, 7703.82] + - - [64, 128, 36, 53824] + - [252, 6331.31] + - - [256, 256, 36, 32] + - [248, 4648.86] + - - [192, 256, 64, 288] + - [292, 8987.79] + - - [256, 256, 36, 16] + - [262, 2912.71] + - - [128, 256, 36, 3200] + - [265, 8680.27] + - - [160, 320, 64, 512] + - [224, 8449.44] + - - [128, 160, 36, 512] + - [235, 7214.97] + - - [96, 96, 36, 2592] + - [230, 7104.79] + - - [64, 96, 64, 800] + - [260, 7268.32] + - - [147, 64, 36, 18816] + - [276, 7116.26] + - - [160, 320, 36, 512] + - [230, 7874.82] + - - [256, 512, 36, 4] + - [269, 1034.78] + - - [96, 128, 64, 946] + - [252, 7901.07] + - - [256, 324, 64, 1568] + - [257, 8589.53] + - - [128, 128, 64, 50] + - [248, 4070.56] + - - [35, 96, 36, 8960] + - [242, 4207.3] + - - [32, 64, 36, 43808] + - [283, 4390.81] + - - [160, 224, 36, 128] + - [230, 5446.92] + - - [64, 64, 64, 81] + - [255, 2391.18] + - - [256, 256, 36, 3200] + - [221, 9559.55] + - - [256, 256, 36, 210] + - [232, 8414.61] + - - [192, 384, 64, 576] + - [292, 9468.75] + - - [512, 512, 64, 800] + - [267, 10096.4] + - - [512, 24, 36, 800] + - [218, 4761.77] + - - [64, 64, 64, 13216] + - [279, 8491.41] + - - [192, 224, 64, 1152] + - [235, 8769.06] + - - [256, 256, 64, 1152] + - [257, 9988.09] + - - [512, 486, 64, 512] + - [267, 9254.67] + - - [128, 128, 36, 784] + - [230, 7468.06] + - - [256, 512, 64, 1600] + - [254, 10232.5] + - - [512, 512, 64, 9] + - [248, 2599.78] + - - [96, 128, 64, 288] + - [260, 6599.43] + - - [64, 96, 36, 512] + - [260, 5073.75] + - - [256, 512, 36, 1568] + - [292, 9637.81] + - - [128, 128, 64, 400] + - [292, 8192.0] + - - [128, 128, 64, 800] + - [292, 8716.34] + - - [96, 128, 36, 512] + - [280, 6756.93] + - - [16, 32, 36, 360] + - [243, 754.036] + - - [128, 256, 64, 3200] + - [257, 10222.5] + - - [96, 128, 64, 800] + - [260, 7967.9] + - - [256, 512, 64, 4] + - [222, 1097.99] + - - [256, 256, 64, 450] + - [267, 9347.45] + - - [64, 64, 64, 3200] + - [278, 8518.08] + - - [192, 224, 64, 128] + - [238, 7035.17] + - - [128, 128, 64, 288] + - [292, 7751.28] + - - [256, 256, 64, 72] + - [248, 7489.83] + - - [96, 208, 36, 512] + - [260, 6939.11] + - - [128, 256, 36, 3136] + - [235, 8669.33] + - - [64, 64, 36, 3520] + - [230, 6007.47] + - - [64, 128, 36, 1568] + - [293, 6897.7] + - - [160, 320, 64, 242] + - [219, 7873.17] + - - [192, 192, 36, 512] + - [230, 7707.32] + - - [512, 512, 36, 512] + - [292, 9582.42] + - - [1, 16, 64, 10240] + - [244, 71.3511] + - - [128, 128, 36, 512] + - [230, 7149.38] + - - [512, 512, 36, 256] + - [221, 9384.4] + - - [512, 512, 36, 1024] + - [215, 9777.89] + - - [96, 208, 64, 1152] + - [293, 7850.9] + - - [128, 192, 64, 3200] + - [221, 9490.82] + - - [256, 256, 36, 4096] + - [226, 9585.46] + - - [160, 160, 64, 288] + - [260, 7299.8] + - - [256, 256, 64, 896] + - [257, 9850.33] + - - [128, 256, 64, 242] + - [292, 8391.38] + - - [128, 128, 36, 440] + - [235, 6274.72] + - - [96, 128, 36, 1568] + - [280, 7875.03] + - - [192, 384, 36, 1024] + - [226, 8715.72] + - - [64, 96, 36, 10368] + - [297, 7478.59] + - - [128, 256, 64, 100] + - [241, 7084.97] + - - [112, 224, 36, 2048] + - [234, 7555.92] + - - [384, 256, 64, 1152] + - [257, 10102.3] + - - [192, 384, 36, 128] + - [292, 7543.04] + - - [128, 128, 36, 7040] + - [265, 7600.6] + - - [128, 256, 64, 1568] + - [257, 10005.9] + - - [128, 128, 36, 1568] + - [249, 7848.3] + - - [128, 256, 64, 72] + - [272, 6553.6] + - - [256, 256, 36, 12544] + - [286, 9365.04] + - - [256, 256, 36, 105] + - [248, 7286.06] + - - [128, 256, 36, 392] + - [235, 7625.69] + - - [64, 64, 64, 5408] + - [278, 8882.67] + - - [3, 64, 36, 25088] + - [245, 528.942] + - - [384, 256, 36, 1024] + - [292, 9182.75] + - - [35, 96, 36, 13440] + - [299, 4110.29] + - - [128, 256, 64, 1152] + - [257, 9804.87] + - - [256, 324, 64, 32] + - [270, 5043.63] + - - [160, 224, 64, 128] + - [284, 6046.15] + - - [192, 224, 36, 2592] + - [282, 8878.68] + - - [96, 96, 64, 1152] + - [260, 8035.45] + - - [32, 64, 36, 90] + - [217, 964.465] + - - [64, 128, 64, 2888] + - [232, 9047.23] + - - [256, 384, 36, 800] + - [292, 9154.02] + - - [512, 512, 64, 4] + - [289, 1233.62] + - - [192, 320, 36, 128] + - [229, 7388.19] + - - [64, 128, 36, 480] + - [293, 5653.27] + - - [192, 384, 64, 242] + - [292, 9079.99] + - - [256, 486, 64, 32] + - [285, 5909.18] + - - [147, 64, 64, 9702] + - [294, 7319.69] + - - [512, 512, 64, 64] + - [228, 8179.02] + - - [64, 192, 64, 3698] + - [221, 9287.89] + - - [73, 192, 64, 10439] + - [252, 6668.02] + - - [1, 16, 36, 1440] + - [268, 33.4452] + - - [128, 256, 36, 512] + - [235, 7989.15] + - - [512, 512, 64, 576] + - [267, 9951.89] + - - [64, 64, 36, 12544] + - [283, 5872.77] + - - [128, 128, 36, 880] + - [280, 7597.26] + - - [192, 224, 36, 128] + - [238, 6451.2] + - - [64, 64, 64, 800] + - [278, 6916.73] + - - [64, 128, 36, 12544] + - [256, 6395.88] + - - [64, 64, 36, 1568] + - [230, 5536.66] + - - [160, 160, 36, 512] + - [230, 7345.26] + - - [512, 24, 64, 512] + - [220, 5242.88] + - - [3, 64, 36, 3136] + - [245, 475.352] + - - [256, 256, 64, 9] + - [270, 2106.51] + - - [3, 64, 64, 11552] + - [273, 785.127] + - - [128, 256, 36, 12544] + - [288, 8792.13] + - - [128, 128, 36, 3136] + - [249, 8098.46] + - - [256, 512, 36, 3136] + - [232, 9694.39] + - - [64, 64, 36, 196] + - [246, 2757.76] + - - [144, 288, 36, 512] + - [280, 7077.89] + - - [256, 24, 64, 32] + - [259, 1483.83] + - - [384, 384, 36, 800] + - [221, 9246.5] + - - [512, 512, 64, 1600] + - [267, 10277.3] + - - [112, 224, 36, 512] + - [235, 6744.78] + - - [128, 128, 36, 49] + - [241, 2716.29] + - - [512, 512, 36, 4] + - [269, 1156.52] + - - [35, 96, 64, 4235] + - [230, 4631.28] + - - [192, 384, 64, 450] + - [221, 9372.2] + - - [256, 256, 36, 1024] + - [292, 9346.64] + - - [112, 224, 64, 1152] + - [235, 7523.95] + - - [256, 512, 64, 400] + - [254, 9597.95] + - - [149, 32, 36, 19072] + - [299, 5811.8] + - - [128, 256, 36, 6272] + - [235, 8754.68] + - - [128, 192, 36, 1568] + - [260, 8195.1] + - - [256, 256, 36, 512] + - [292, 9074.22] + - - [256, 256, 64, 112] + - [292, 8305.55] + - - [512, 512, 64, 18] + - [285, 4324.02] + - - [256, 256, 64, 18] + - [248, 3547.81] + - - [256, 256, 64, 1568] + - [257, 10141.7] + - - [64, 96, 36, 1568] + - [278, 6805.66] + - - [384, 256, 36, 4096] + - [292, 9311.1] + - - [256, 512, 64, 800] + - [267, 9998.35] + - - [256, 384, 36, 2048] + - [292, 9285.34] + - - [3, 64, 36, 200704] + - [274, 547.375] + - - [384, 384, 64, 2304] + - [215, 9901.68] + - - [160, 320, 64, 128] + - [251, 7113.81] + - - [512, 512, 36, 528] + - [221, 9567.65] + - - [160, 320, 36, 128] + - [252, 6411.13] + - - [96, 96, 64, 800] + - [260, 7690.01] + - - [256, 512, 36, 49] + - [248, 6721.25] + - - [384, 384, 64, 450] + - [221, 9523.53] + - - [3, 64, 64, 23104] + - [273, 801.621] + - - [256, 256, 64, 3200] + - [257, 10300.4] + - - [128, 192, 36, 512] + - [235, 7499.75] + - - [192, 192, 64, 288] + - [292, 8774.24] + - - [96, 208, 64, 242] + - [252, 5901.99] + - - [256, 16, 36, 3200] + - [281, 3807.77] + - - [512, 512, 64, 8] + - [259, 2379.75] + - - [64, 128, 64, 5776] + - [232, 9332.74] + - - [512, 512, 64, 288] + - [221, 9521.99] + - - [256, 16, 36, 32] + - [277, 766.005] + - - [128, 192, 64, 288] + - [292, 8527.58] + - - [32, 64, 64, 640] + - [260, 4660.34] + - - [64, 64, 36, 392] + - [260, 3686.4] + - - [384, 384, 36, 1024] + - [226, 9282.48] + - - [64, 64, 36, 11552] + - [290, 5904.78] + - - [96, 128, 36, 6272] + - [280, 8350.99] + - - [128, 256, 36, 16] + - [262, 2144.81] + - - [256, 256, 64, 288] + - [292, 9140.13] + - - [64, 64, 64, 1652] + - [278, 7766.53] + - - [256, 384, 36, 1024] + - [226, 9203.27] + - - [96, 128, 64, 3200] + - [295, 8866.2] + - - [256, 324, 36, 3200] + - [234, 8194.25] + - - [128, 192, 64, 800] + - [292, 9198.03] + - - [64, 128, 64, 10] + - [233, 851.117] + - - [96, 208, 64, 288] + - [260, 6667.58] + - - [64, 96, 36, 2592] + - [242, 7216.88] + - - [64, 128, 64, 160] + - [271, 5190.97] + - - [192, 384, 64, 512] + - [221, 9446.04] + - - [64, 64, 36, 6272] + - [230, 6212.01] + - - [512, 24, 36, 288] + - [227, 3922.47] + - - [128, 128, 64, 1568] + - [221, 9037.86] + - - [112, 224, 64, 242] + - [291, 6399.26] + - - [128, 256, 64, 1600] + - [257, 10010.3] + - - [32, 32, 64, 20000] + - [225, 4378.41] + - - [160, 192, 64, 288] + - [252, 7803.63] + - - [512, 24, 64, 128] + - [213, 3733.8] + - - [512, 512, 36, 32] + - [248, 5935.34] + - - [3, 64, 36, 100352] + - [245, 542.783] + - - [3, 64, 64, 1444] + - [274, 674.159] + - - [512, 512, 36, 3136] + - [215, 9921.1] + - - [128, 256, 64, 6400] + - [275, 10349.3] + - - [256, 256, 36, 2048] + - [292, 9518.99] + - - [128, 160, 64, 288] + - [235, 7549.75] + - - [256, 256, 64, 6400] + - [257, 10392.6] + - - [32, 64, 64, 20000] + - [283, 6493.86] + - - [256, 256, 36, 1680] + - [232, 9513.29] + - - [128, 128, 64, 210] + - [292, 7094.1] + - - [192, 384, 36, 2048] + - [221, 8818.65] + - - [256, 256, 64, 144] + - [292, 8608.61] + - - [384, 384, 36, 4096] + - [226, 9356.94] + - - [160, 320, 64, 1152] + - [252, 8749.48] + - - [384, 256, 36, 2048] + - [292, 9279.63] + - - [256, 512, 36, 392] + - [292, 9252.14] + - - [256, 512, 64, 50] + - [248, 7511.29] + - - [73, 192, 36, 23360] + - [296, 5802.93] + - - [3, 64, 36, 50176] + - [245, 542.037] + - - [384, 384, 36, 2048] + - [221, 9325.8] + - - [256, 384, 64, 450] + - [267, 9528.66] + - - [192, 320, 64, 128] + - [226, 8399.81] + - - [128, 256, 36, 32] + - [241, 3276.8] + - - [160, 192, 36, 512] + - [280, 7752.34] + - - [512, 512, 64, 256] + - [232, 9473.64] + - - [256, 512, 64, 32] + - [270, 6391.32] + - - [384, 384, 64, 576] + - [221, 9614.79] + - - [64, 64, 64, 648] + - [278, 6282.15] + - - [512, 486, 36, 288] + - [292, 8624.93] + - - [32, 64, 36, 1440] + - [230, 3961.5] + - - [144, 288, 64, 242] + - [252, 6347.02] + - - [384, 256, 64, 576] + - [257, 9775.24] + - - [512, 512, 36, 64] + - [228, 7791.28] + - - [448, 384, 64, 128] + - [221, 9132.23] + - - [64, 128, 64, 722] + - [271, 8047.11] + - - [144, 288, 64, 288] + - [280, 6859.4] + - - [512, 512, 64, 224] + - [292, 9427.29] + - - [112, 224, 64, 288] + - [291, 6736.92] + - - [384, 384, 64, 1152] + - [215, 9820.46] + - - [448, 384, 36, 128] + - [292, 8761.31] + - - [64, 64, 64, 100] + - [238, 2708.1] + - - [256, 486, 36, 128] + - [264, 7640.04] + - - [64, 96, 64, 4608] + - [293, 8351.49] + - - [16, 32, 64, 160] + - [217, 736.36] + - - [64, 192, 36, 6272] + - [293, 8041.19] + - - [64, 64, 64, 200] + - [246, 3924.31] + - - [256, 256, 36, 800] + - [292, 9299.55] + - - [64, 128, 36, 6272] + - [290, 6816.36] + - - [32, 64, 64, 40] + - [237, 885.622] + - - [256, 16, 64, 32] + - [287, 1205.26] + - - [192, 384, 36, 800] + - [226, 8673.88] + - - [128, 128, 36, 3200] + - [260, 8538.89] + - - [256, 256, 36, 256] + - [232, 8454.36] + - - [192, 384, 64, 1152] + - [221, 9589.01] + - - [128, 256, 64, 200] + - [231, 8141.12] + - - [64, 96, 64, 1152] + - [260, 7620.88] + - - [128, 128, 36, 392] + - [235, 6175.51] + - - [80, 192, 36, 10368] + - [283, 6497.16] + - - [224, 224, 36, 128] + - [293, 5826.89] + - - [512, 512, 64, 28] + - [248, 5728.81] + - - [256, 16, 64, 1568] + - [263, 4637.2] + - - [144, 288, 64, 1152] + - [280, 7784.24] + - - [256, 256, 64, 576] + - [257, 9596.12] + - - [64, 128, 36, 784] + - [293, 6058.99] + - - [256, 24, 36, 128] + - [227, 2239.84] + - - [256, 256, 64, 2304] + - [257, 10225.7] + - - [192, 384, 36, 512] + - [292, 8549.03] + - - [16, 32, 64, 2560] + - [245, 2153.13] + - - [256, 512, 36, 32] + - [270, 5702.23] + - - [512, 512, 64, 128] + - [292, 9084.11] + - - [128, 128, 64, 200] + - [229, 6971.91] + - - [512, 512, 64, 32] + - [241, 6248.5] + - - [128, 256, 36, 196] + - [241, 6628.76] + - - [8, 384, 64, 6600] + - [273, 2733.89] + - - [149, 32, 64, 8195] + - [235, 6050.91] + - - [35, 96, 64, 6160] + - [280, 4689.35] + - - [64, 64, 36, 1760] + - [230, 5622.24] - - [1024, 128, 1, 128] - - [170, 1028.12] + - [304, 1028.12] - - [4, 704, 1, 1280] - - [209, 363.455] + - [343, 363.455] - - [4, 1856, 1, 3328] - - [209, 579.534] + - [343, 579.534] - - [1856, 448, 1, 3328] - - [246, 6966.83] + - [380, 6966.83] - - [2944, 4288, 1, 1280] - - [241, 9057.98] + - [375, 9057.98] - - [2368, 64, 1, 3328] - - [202, 5837.66] + - [336, 5837.66] - - [2368, 5888, 1, 256] - - [246, 9111.16] + - [380, 9111.16] - - [128, 64, 1, 256] - - [208, 374.591] + - [342, 374.591] - - [5888, 1024, 1, 1280] - - [251, 8570.54] + - [385, 8570.54] - - [128, 6784, 1, 3328] - - [214, 7703.96] + - [348, 7703.96] - - [64, 4, 1, 256] - - [260, 11.3219] + - [394, 11.3219] - - [5888, 1856, 1, 3328] - - [246, 9394.4] + - [380, 9394.4] - - [5056, 704, 1, 256] - - [249, 8026.99] + - [383, 8026.99] - - [5888, 2944, 1, 3328] - - [239, 7608.21] + - [373, 7608.21] - - [1856, 4288, 1, 256] - - [240, 8986.42] + - [374, 8986.42] - - [1024, 5056, 1, 128] - - [232, 3898.34] + - [366, 3898.34] - - [5056, 5056, 1, 3328] - - [240, 9536.85] + - [374, 9536.85] - - [1408, 5888, 1, 1280] - - [241, 9279.19] + - [375, 9279.19] - - [2368, 448, 1, 128] - - [233, 2474.42] + - [367, 2474.42] - - [1024, 3584, 1, 3328] - - [243, 9258.58] + - [377, 9258.58] - - [4, 2944, 1, 1280] - - [195, 611.84] + - [329, 611.84] - - [1408, 64, 1, 128] - - [166, 858.31] + - [300, 858.31] - - [256, 4288, 1, 3328] - - [246, 7616.08] + - [380, 7616.08] - - [5888, 1408, 1, 1280] - - [239, 9620.39] + - [373, 9620.39] - - [704, 1856, 1, 3328] - - [240, 9033.75] + - [374, 9033.75] - - [4, 1408, 1, 128] - - [253, 24.455] + - [387, 24.455] - - [1024, 2368, 1, 256] - - [240, 7526.25] + - [374, 7526.25] - - [1408, 1856, 1, 1280] - - [243, 8324.19] + - [377, 8324.19] - - [1408, 64, 1, 1280] - - [214, 4681.24] + - [348, 4681.24] - - [448, 1024, 1, 1280] - - [240, 7112.53] + - [374, 7112.53] - - [256, 1408, 1, 3328] - - [246, 5825.51] + - [380, 5825.51] - - [5056, 5056, 1, 1280] - - [249, 9233.65] + - [383, 9233.65] - - [448, 5056, 1, 256] - - [241, 7003.27] + - [375, 7003.27] - - [704, 1856, 1, 1280] - - [240, 8877.38] + - [374, 8877.38] - - [128, 5056, 1, 128] - - [232, 2301.14] + - [366, 2301.14] - - [2368, 128, 1, 256] - - [240, 3849.04] + - [374, 3849.04] - - [1856, 1408, 1, 128] - - [235, 4202.31] + - [369, 4202.31] - - [64, 5056, 1, 256] - - [241, 3109.62] + - [375, 3109.62] - - [6784, 256, 1, 3328] - - [240, 6388.53] + - [374, 6388.53] - - [6784, 4288, 1, 3328] - - [251, 9114.67] + - [385, 9114.67] - - [4288, 448, 1, 256] - - [244, 5783.05] + - [378, 5783.05] - - [64, 704, 1, 128] - - [177, 379.519] + - [311, 379.519] - - [1856, 2368, 1, 3328] - - [240, 9128.46] + - [374, 9128.46] - - [4288, 2944, 1, 1280] - - [246, 9182.33] + - [380, 9182.33] - - [704, 5056, 1, 1280] - - [240, 9071.57] + - [374, 9071.57] - - [2368, 704, 1, 3328] - - [246, 7731.43] + - [380, 7731.43] - - [256, 5888, 1, 256] - - [240, 7920.38] + - [374, 7920.38] - - [1856, 4288, 1, 3328] - - [246, 9330.07] + - [380, 9330.07] - - [256, 2944, 1, 256] - - [247, 5312.27] + - [381, 5312.27] - - [5888, 1024, 1, 256] - - [238, 6710.97] + - [372, 6710.97] - - [448, 64, 1, 1280] - - [213, 2814.53] + - [347, 2814.53] - - [448, 5056, 1, 3328] - - [240, 8255.53] + - [374, 8255.53] - - [3584, 4, 1, 1280] - - [189, 640.815] + - [323, 640.815] - - [2944, 64, 1, 256] - - [188, 2621.54] + - [322, 2621.54] - - [128, 4, 1, 1280] - - [260, 86.3316] + - [394, 86.3316] - - [1408, 2944, 1, 256] - - [240, 8848.99] + - [374, 8848.99] - - [256, 1856, 1, 1280] - - [240, 7366.55] + - [374, 7366.55] - - [6784, 5056, 1, 3328] - - [251, 8332.16] + - [385, 8332.16] - - [5056, 5056, 1, 256] - - [246, 9171.74] + - [380, 9171.74] - - [1408, 6784, 1, 128] - - [232, 5079.19] + - [366, 5079.19] - - [64, 1024, 1, 1280] - - [204, 3679.31] + - [338, 3679.31] - - [2944, 4, 1, 256] - - [195, 369.543] + - [329, 369.543] - - [704, 5056, 1, 128] - - [232, 4509.27] + - [366, 4509.27] - - [4, 2368, 1, 1280] - - [189, 569.844] + - [323, 569.844] - - [2368, 2944, 1, 1280] - - [251, 7451.14] + - [385, 7451.14] - - [128, 3584, 1, 1280] - - [249, 6071.26] + - [383, 6071.26] - - [6784, 6784, 1, 1280] - - [246, 9535.74] - - - [1024, 256, 1, 3328] - - [240, 5742.68] + - [380, 9535.74] - - [1408, 4288, 1, 1280] - - [249, 8255.09] + - [383, 8255.09] - - [3584, 4288, 1, 1280] - - [251, 9651.19] + - [385, 9651.19] - - [2368, 704, 1, 1280] - - [246, 8291.4] + - [380, 8291.4] - - [5056, 4288, 1, 3328] - - [238, 9406.36] + - [372, 9406.36] - - [3584, 2368, 1, 3328] - - [246, 9350.32] + - [380, 9350.32] - - [64, 704, 1, 1280] - - [213, 3384.59] + - [347, 3384.59] - - [4288, 256, 1, 256] - - [246, 5593.62] + - [380, 5593.62] - - [2944, 128, 1, 128] - - [168, 2130.6] + - [302, 2130.6] - - [6784, 448, 1, 1280] - - [249, 8815.85] + - [383, 8815.85] - - [1408, 2944, 1, 128] - - [232, 4558.34] + - [366, 4558.34] - - [4288, 2944, 1, 256] - - [251, 7865.43] + - [385, 7865.43] - - [5888, 704, 1, 1280] - - [240, 9262.99] + - [374, 9262.99] - - [1856, 64, 1, 1280] - - [214, 4359.15] + - [348, 4359.15] - - [448, 5888, 1, 128] - - [235, 4000.59] + - [369, 4000.59] - - [5888, 64, 1, 3328] - - [215, 6603.39] + - [349, 6603.39] - - [2944, 256, 1, 3328] - - [240, 8423.63] + - [374, 8423.63] - - [1024, 64, 1, 128] - - [185, 582.642] + - [319, 582.642] - - [5056, 2368, 1, 1280] - - [240, 9419.91] + - [374, 9419.91] - - [448, 3584, 1, 1280] - - [240, 7985.82] + - [374, 7985.82] - - [6784, 5888, 1, 256] - - [238, 9494.36] + - [372, 9494.36] - - [704, 1024, 1, 128] - - [232, 2813.35] + - [366, 2813.35] - - [704, 128, 1, 1280] - - [214, 4477.71] + - [348, 4477.71] - - [5888, 2944, 1, 128] - - [235, 4745.96] + - [369, 4745.96] - - [4, 3584, 1, 128] - - [252, 96.479] + - [386, 96.479] - - [1408, 448, 1, 1280] - - [240, 6912.8] + - [374, 6912.8] - - [1024, 1408, 1, 256] - - [248, 5810.85] + - [382, 5810.85] - - [2368, 2368, 1, 3328] - - [249, 9088.71] + - [383, 9088.71] - - [1856, 6784, 1, 128] - - [235, 5168.32] + - [369, 5168.32] - - [5056, 704, 1, 3328] - - [241, 7464.9] + - [375, 7464.9] - - [1408, 1856, 1, 256] - - [246, 6727.69] + - [380, 6727.69] - - [1408, 704, 1, 3328] - - [246, 8379.53] + - [380, 8379.53] - - [2368, 5056, 1, 256] - - [246, 8664.11] + - [380, 8664.11] - - [5888, 1856, 1, 256] - - [251, 5810.02] + - [385, 5810.02] - - [4288, 64, 1, 3328] - - [228, 6583.94] + - [362, 6583.94] - - [2368, 4, 1, 1280] - - [261, 545.251] + - [395, 545.251] - - [704, 5888, 1, 256] - - [246, 8813.71] + - [380, 8813.71] - - [4288, 64, 1, 256] - - [204, 3059.97] + - [338, 3059.97] - - [6784, 64, 1, 256] - - [246, 3490.96] + - [380, 3490.96] - - [2944, 256, 1, 256] - - [240, 6970.4] + - [374, 6970.4] - - [2944, 6784, 1, 3328] - - [240, 9475.79] + - [374, 9475.79] - - [704, 1408, 1, 3328] - - [240, 8154.18] + - [374, 8154.18] - - [3584, 704, 1, 3328] - - [240, 8995.07] + - [374, 8995.07] - - [2944, 256, 1, 128] - - [232, 2824.13] + - [366, 2824.13] - - [6784, 4, 1, 1280] - - [189, 625.714] + - [323, 625.714] - - [1024, 64, 1, 1280] - - [201, 3307.91] + - [335, 3307.91] - - [448, 4288, 1, 256] - - [246, 6074.48] + - [380, 6074.48] - - [64, 3584, 1, 3328] - - [194, 6200.26] + - [328, 6200.26] - - [704, 2368, 1, 1280] - - [240, 8291.4] + - [374, 8291.4] - - [448, 2944, 1, 128] - - [232, 3221.87] + - [366, 3221.87] - - [1856, 2368, 1, 1280] - - [251, 6855.24] + - [385, 6855.24] - - [2368, 128, 1, 3328] - - [202, 6479.61] + - [336, 6479.61] - - [2944, 128, 1, 256] - - [240, 3828.23] + - [374, 3828.23] - - [448, 1408, 1, 256] - - [241, 4525.9] + - [375, 4525.9] - - [1856, 4288, 1, 1280] - - [239, 9160.32] + - [373, 9160.32] - - [64, 5056, 1, 3328] - - [222, 6819.3] + - [356, 6819.3] - - [4, 704, 1, 256] - - [206, 123.541] + - [340, 123.541] - - [1024, 448, 1, 128] - - [235, 1989.27] + - [369, 1989.27] - - [704, 4, 1, 1280] - - [209, 381.931] + - [343, 381.931] - - [704, 256, 1, 128] - - [232, 1109.17] + - [366, 1109.17] - - [704, 2944, 1, 128] - - [232, 4089.03] + - [366, 4089.03] - - [1408, 1024, 1, 1280] - - [246, 8192.08] + - [380, 8192.08] - - [704, 6784, 1, 256] - - [240, 6717.9] + - [374, 6717.9] - - [6784, 704, 1, 256] - - [246, 5429.22] + - [380, 5429.22] - - [5056, 1408, 1, 128] - - [232, 4954.5] + - [366, 4954.5] - - [256, 3584, 1, 3328] - - [240, 7890.96] + - [374, 7890.96] - - [4, 5888, 1, 3328] - - [257, 691.047] + - [391, 691.047] - - [128, 1408, 1, 128] - - [179, 1393.14] + - [313, 1393.14] - - [3584, 4288, 1, 3328] - - [242, 8900.87] + - [376, 8900.87] - - [5888, 1856, 1, 1280] - - [243, 9345.85] + - [377, 9345.85] - - [5056, 1024, 1, 3328] - - [244, 7834.84] + - [378, 7834.84] - - [5056, 64, 1, 1280] - - [222, 5890.14] + - [356, 5890.14] - - [1024, 704, 1, 256] - - [240, 6007.57] + - [374, 6007.57] - - [1024, 4288, 1, 128] - - [234, 3497.09] + - [368, 3497.09] - - [4288, 64, 1, 1280] - - [219, 4726.59] + - [353, 4726.59] - - [2368, 3584, 1, 1280] - - [238, 8128.82] + - [372, 8128.82] - - [2368, 6784, 1, 1280] - - [238, 9478.72] + - [372, 9478.72] - - [1024, 256, 1, 256] - - [246, 4092.1] + - [380, 4092.1] - - [1856, 4, 1, 1280] - - [261, 509.903] + - [395, 509.903] - - [448, 448, 1, 256] - - [246, 3001.28] + - [380, 3001.28] - - [2944, 3584, 1, 3328] - - [247, 9081.91] + - [381, 9081.91] - - [128, 4288, 1, 128] - - [167, 2323.33] + - [301, 2323.33] - - [64, 448, 1, 256] - - [210, 1066.97] + - [344, 1066.97] - - [128, 1024, 1, 3328] - - [223, 6392.36] + - [357, 6392.36] - - [4, 1408, 1, 3328] - - [206, 616.656] + - [340, 616.656] - - [6784, 2944, 1, 256] - - [249, 8547.73] + - [383, 8547.73] - - [64, 1856, 1, 1280] - - [222, 4409.71] + - [356, 4409.71] - - [64, 1024, 1, 128] - - [166, 554.902] + - [300, 554.902] - - [4288, 2368, 1, 3328] - - [242, 8780.08] + - [376, 8780.08] - - [1856, 2368, 1, 256] - - [249, 4976.74] + - [383, 4976.74] - - [3584, 256, 1, 128] - - [234, 2812.37] + - [368, 2812.37] - - [3584, 6784, 1, 3328] - - [244, 9278.22] + - [378, 9278.22] - - [256, 1024, 1, 256] - - [240, 4346.53] + - [374, 4346.53] - - [4, 6784, 1, 3328] - - [259, 681.366] + - [393, 681.366] - - [1024, 5888, 1, 3328] - - [240, 9187.61] + - [374, 9187.61] - - [1024, 128, 1, 1280] - - [192, 3660.05] + - [326, 3660.05] - - [4288, 128, 1, 1280] - - [246, 6019.17] + - [380, 6019.17] - - [5056, 4288, 1, 1280] - - [238, 9343.96] + - [372, 9343.96] - - [5888, 64, 1, 256] - - [240, 4692.17] + - [374, 4692.17] - - [1856, 256, 1, 1280] - - [246, 4790.38] + - [380, 4790.38] - - [64, 5888, 1, 3328] - - [214, 6702.2] + - [348, 6702.2] - - [2944, 5888, 1, 128] - - [235, 5202.65] + - [369, 5202.65] - - [704, 5888, 1, 1280] - - [240, 9264.29] + - [374, 9264.29] - - [2368, 3584, 1, 128] - - [232, 5053.71] + - [366, 5053.71] - - [6784, 5888, 1, 3328] - - [238, 7926.8] + - [372, 7926.8] - - [704, 1024, 1, 1280] - - [239, 5402.6] + - [373, 5402.6] - - [448, 256, 1, 3328] - - [222, 6124.65] + - [356, 6124.65] - - [448, 1856, 1, 128] - - [233, 2885.96] + - [367, 2885.96] - - [128, 1024, 1, 128] - - [167, 1013.22] + - [301, 1013.22] - - [2944, 4, 1, 128] - - [252, 77.6374] + - [386, 77.6374] - - [1024, 704, 1, 1280] - - [240, 7365.58] + - [374, 7365.58] - - [128, 5888, 1, 256] - - [240, 6990.61] + - [374, 6990.61] - - [1024, 5056, 1, 1280] - - [245, 9422.0] + - [379, 9422.0] - - [4288, 1024, 1, 256] - - [247, 6270.03] + - [381, 6270.03] - - [2944, 2368, 1, 128] - - [232, 4918.18] + - [366, 4918.18] - - [704, 704, 1, 3328] - - [240, 7963.65] + - [374, 7963.65] - - [704, 1408, 1, 1280] - - [240, 8347.32] + - [374, 8347.32] - - [5888, 448, 1, 1280] - - [246, 5217.05] + - [380, 5217.05] - - [3584, 256, 1, 3328] - - [240, 7802.25] + - [374, 7802.25] - - [704, 5888, 1, 3328] - - [246, 8381.46] + - [380, 8381.46] - - [704, 1856, 1, 128] - - [232, 3598.38] + - [366, 3598.38] - - [128, 3584, 1, 3328] - - [202, 7161.11] + - [336, 7161.11] - - [6784, 2368, 1, 1280] - - [251, 9464.41] + - [385, 9464.41] - - [4, 4288, 1, 128] - - [252, 132.68] + - [386, 132.68] - - [128, 704, 1, 1280] - - [214, 4463.85] + - [348, 4463.85] - - [3584, 2944, 1, 256] - - [251, 8201.24] + - [385, 8201.24] - - [1856, 128, 1, 3328] - - [193, 6575.5] + - [327, 6575.5] - - [4, 64, 1, 1280] - - [209, 43.6745] + - [343, 43.6745] - - [4, 5056, 1, 3328] - - [189, 675.315] + - [323, 675.315] - - [128, 2944, 1, 1280] - - [193, 5916.99] + - [327, 5916.99] - - [2368, 1024, 1, 3328] - - [246, 8646.84] + - [380, 8646.84] - - [128, 256, 1, 3328] - - [227, 4130.85] + - [361, 4130.85] - - [1408, 5056, 1, 3328] - - [245, 9529.75] + - [379, 9529.75] - - [1856, 1856, 1, 3328] - - [244, 8114.99] + - [378, 8114.99] - - [3584, 128, 1, 256] - - [240, 5603.18] + - [374, 5603.18] - - [448, 1408, 1, 3328] - - [240, 7073.03] + - [374, 7073.03] - - [2368, 2368, 1, 256] - - [247, 7648.76] + - [381, 7648.76] - - [4288, 4288, 1, 1280] - - [242, 9244.11] + - [376, 9244.11] - - [64, 448, 1, 1280] - - [213, 2885.33] + - [347, 2885.33] - - [1408, 4288, 1, 256] - - [240, 8080.41] + - [374, 8080.41] - - [448, 4, 1, 256] - - [258, 84.4294] + - [392, 84.4294] - - [5888, 448, 1, 128] - - [235, 3540.8] + - [369, 3540.8] - - [448, 4, 1, 1280] - - [209, 322.257] + - [343, 322.257] - - [704, 6784, 1, 3328] - - [239, 8613.58] + - [373, 8613.58] - - [5888, 5888, 1, 1280] - - [246, 9502.05] + - [380, 9502.05] - - [5056, 1024, 1, 1280] - - [249, 9110.11] + - [383, 9110.11] - - [448, 5888, 1, 3328] - - [240, 8586.43] + - [374, 8586.43] - - [128, 4, 1, 128] - - [252, 4.27959] + - [386, 4.27959] - - [1024, 2944, 1, 1280] - - [248, 7096.53] + - [382, 7096.53] - - [5056, 5888, 1, 1280] - - [239, 9693.51] + - [373, 9693.51] - - [4288, 5888, 1, 128] - - [232, 5406.46] + - [366, 5406.46] - - [256, 3584, 1, 256] - - [240, 6908.37] + - [374, 6908.37] - - [1408, 3584, 1, 128] - - [232, 4645.69] + - [366, 4645.69] - - [256, 2944, 1, 3328] - - [243, 6284.4] + - [377, 6284.4] - - [448, 3584, 1, 128] - - [235, 3675.37] + - [369, 3675.37] - - [5888, 2944, 1, 1280] - - [245, 9628.9] + - [379, 9628.9] - - [4, 6784, 1, 1280] - - [189, 688.176] + - [323, 688.176] - - [2368, 5888, 1, 128] - - [232, 5273.96] + - [366, 5273.96] - - [64, 2944, 1, 128] - - [176, 1316.54] + - [310, 1316.54] - - [3584, 5888, 1, 256] - - [246, 9239.14] + - [380, 9239.14] - - [2368, 704, 1, 128] - - [235, 3537.65] + - [369, 3537.65] - - [3584, 2944, 1, 1280] - - [240, 9324.62] + - [374, 9324.62] - - [3584, 2368, 1, 128] - - [232, 4766.34] + - [366, 4766.34] - - [5056, 704, 1, 128] - - [232, 4487.95] + - [366, 4487.95] - - [448, 2368, 1, 128] - - [235, 2877.02] + - [369, 2877.02] - - [5056, 1408, 1, 3328] - - [251, 9515.97] + - [385, 9515.97] - - [1408, 704, 1, 256] - - [243, 6836.18] + - [377, 6836.18] - - [6784, 1024, 1, 3328] - - [238, 9309.65] + - [372, 9309.65] - - [6784, 2944, 1, 3328] - - [239, 9536.58] + - [373, 9536.58] - - [2944, 5056, 1, 3328] - - [240, 9526.25] + - [374, 9526.25] - - [1856, 1856, 1, 256] - - [240, 5239.24] + - [374, 5239.24] - - [1024, 5888, 1, 128] - - [232, 4006.28] + - [366, 4006.28] - - [2048, 7133, 1, 2048] - - [238, 9828.07] + - [372, 9828.07] - - [256, 4, 1, 128] - - [253, 4.38908] + - [387, 4.38908] - - [4288, 5888, 1, 1280] - - [248, 9202.83] + - [382, 9202.83] - - [4288, 4288, 1, 256] - - [243, 5521.18] + - [377, 5521.18] - - [448, 2944, 1, 3328] - - [246, 7724.53] + - [380, 7724.53] - - [4288, 1856, 1, 1280] - - [246, 8826.34] + - [380, 8826.34] - - [1856, 2944, 1, 3328] - - [240, 9194.9] + - [374, 9194.9] - - [256, 6784, 1, 3328] - - [240, 8740.33] + - [374, 8740.33] - - [64, 5888, 1, 256] - - [240, 4766.35] + - [374, 4766.35] - - [256, 5056, 1, 128] - - [232, 2937.6] + - [366, 2937.6] - - [5056, 1024, 1, 256] - - [251, 5467.91] + - [385, 5467.91] - - [704, 64, 1, 3328] - - [228, 4818.43] + - [362, 4818.43] - - [5056, 1856, 1, 3328] - - [245, 8861.69] + - [379, 8861.69] - - [4, 2944, 1, 3328] - - [195, 662.102] + - [329, 662.102] - - [4, 5056, 1, 256] - - [255, 494.121] + - [389, 494.121] - - [1856, 1408, 1, 256] - - [240, 8674.78] + - [374, 8674.78] - - [3584, 4, 1, 128] - - [252, 108.296] + - [386, 108.296] - - [448, 448, 1, 3328] - - [214, 6457.4] + - [348, 6457.4] - - [6784, 128, 1, 3328] - - [207, 7256.71] + - [341, 7256.71] - - [4288, 1408, 1, 128] - - [235, 4791.76] + - [369, 4791.76] - - [4288, 5056, 1, 256] - - [240, 8560.84] + - [374, 8560.84] - - [1408, 128, 1, 1280] - - [222, 5085.79] + - [356, 5085.79] - - [5056, 256, 1, 3328] - - [243, 7284.23] + - [377, 7284.23] - - [704, 704, 1, 256] - - [240, 6171.19] + - [374, 6171.19] - - [1024, 5888, 1, 1280] - - [245, 8852.89] + - [379, 8852.89] - - [6784, 2368, 1, 128] - - [233, 4729.3] + - [367, 4729.3] - - [4, 5056, 1, 1280] - - [206, 670.046] + - [340, 670.046] - - [64, 128, 1, 256] - - [208, 369.317] + - [342, 369.317] - - [128, 1856, 1, 1280] - - [202, 5549.13] + - [336, 5549.13] - - [5056, 3584, 1, 256] - - [246, 7115.84] + - [380, 7115.84] - - [1856, 1024, 1, 1280] - - [238, 8196.5] + - [372, 8196.5] - - [6784, 4288, 1, 1280] - - [239, 9509.66] + - [373, 9509.66] - - [1856, 1856, 1, 1280] - - [241, 5791.99] + - [375, 5791.99] - - [6784, 2944, 1, 128] - - [232, 5317.12] + - [366, 5317.12] - - [1408, 5056, 1, 1280] - - [241, 8980.73] + - [375, 8980.73] - - [4, 2368, 1, 3328] - - [206, 592.634] + - [340, 592.634] - - [5888, 1856, 1, 128] - - [231, 4600.2] + - [365, 4600.2] - - [448, 704, 1, 1280] - - [240, 2286.58] + - [374, 2286.58] - - [2368, 1024, 1, 128] - - [235, 3911.12] + - [369, 3911.12] - - [1024, 448, 1, 3328] - - [240, 7295.24] + - [374, 7295.24] - - [1856, 704, 1, 1280] - - [240, 8881.12] + - [374, 8881.12] - - [5056, 3584, 1, 128] - - [232, 4911.68] + - [366, 4911.68] - - [5888, 5888, 1, 3328] - - [248, 9243.9] + - [382, 9243.9] - - [6784, 1024, 1, 256] - - [251, 5475.41] + - [385, 5475.41] - - [2944, 2368, 1, 256] - - [246, 5670.77] + - [380, 5670.77] - - [256, 448, 1, 256] - - [197, 2293.86] + - [331, 2293.86] - - [5056, 5888, 1, 3328] - - [241, 7848.07] + - [375, 7848.07] - - [1856, 1024, 1, 256] - - [246, 7517.7] + - [380, 7517.7] - - [448, 1408, 1, 1280] - - [240, 6917.54] + - [374, 6917.54] - - [3584, 448, 1, 1280] - - [246, 7980.86] + - [380, 7980.86] - - [1024, 1024, 1, 1280] - - [243, 8384.52] + - [377, 8384.52] - - [448, 5888, 1, 256] - - [240, 7365.75] + - [374, 7365.75] - - [704, 64, 1, 128] - - [185, 358.755] + - [319, 358.755] - - [1408, 6784, 1, 3328] - - [246, 9094.19] + - [380, 9094.19] - - [448, 1024, 1, 128] - - [235, 1773.05] + - [369, 1773.05] - - [4288, 704, 1, 128] - - [232, 4355.38] + - [366, 4355.38] - - [128, 1856, 1, 128] - - [171, 1610.73] + - [305, 1610.73] - - [448, 2368, 1, 3328] - - [246, 7366.47] + - [380, 7366.47] - - [5056, 64, 1, 128] - - [171, 2157.33] + - [305, 2157.33] - - [5056, 2944, 1, 256] - - [240, 9123.16] + - [374, 9123.16] - - [6784, 5888, 1, 128] - - [231, 5285.9] + - [365, 5285.9] - - [704, 1024, 1, 256] - - [246, 6667.35] + - [380, 6667.35] - - [1024, 4, 1, 256] - - [195, 187.346] + - [329, 187.346] - - [2368, 1856, 1, 256] - - [246, 6777.94] + - [380, 6777.94] - - [128, 6784, 1, 1280] - - [243, 7052.71] + - [377, 7052.71] - - [1408, 3584, 1, 3328] - - [247, 9038.05] + - [381, 9038.05] - - [2368, 6784, 1, 256] - - [240, 9181.45] + - [374, 9181.45] - - [5056, 1408, 1, 1280] - - [245, 9422.0] + - [379, 9422.0] - - [256, 256, 1, 128] - - [177, 543.404] + - [311, 543.404] - - [5056, 4288, 1, 128] - - [235, 5340.02] + - [369, 5340.02] - - [1408, 1856, 1, 128] - - [232, 4270.99] + - [366, 4270.99] - - [1408, 5888, 1, 3328] - - [244, 9034.89] + - [378, 9034.89] - - [1856, 256, 1, 256] - - [246, 5847.93] + - [380, 5847.93] - - [6784, 6784, 1, 256] - - [239, 9624.48] + - [373, 9624.48] - - [64, 256, 1, 128] - - [178, 146.549] + - [312, 146.549] - - [4288, 2368, 1, 128] - - [231, 3897.04] + - [365, 3897.04] - - [1856, 4288, 1, 128] - - [232, 4337.17] + - [366, 4337.17] - - [256, 4288, 1, 1280] - - [240, 7499.52] + - [374, 7499.52] - - [2368, 2944, 1, 256] - - [245, 7703.28] + - [379, 7703.28] - - [4, 1856, 1, 256] - - [258, 264.064] + - [392, 264.064] - - [3584, 1856, 1, 1280] - - [240, 9224.43] + - [374, 9224.43] - - [6784, 6784, 1, 128] - - [232, 5476.13] + - [366, 5476.13] - - [256, 1856, 1, 128] - - [235, 1858.82] + - [369, 1858.82] - - [704, 64, 1, 1280] - - [213, 3368.77] + - [347, 3368.77] - - [5888, 5056, 1, 256] - - [246, 5859.91] + - [380, 5859.91] - - [3584, 448, 1, 256] - - [246, 7298.43] + - [380, 7298.43] - - [448, 4288, 1, 128] - - [232, 3813.55] + - [366, 3813.55] - - [2944, 4288, 1, 3328] - - [241, 9149.73] + - [375, 9149.73] - - [256, 6784, 1, 256] - - [240, 7984.95] + - [374, 7984.95] - - [1408, 4288, 1, 128] - - [235, 4728.44] + - [369, 4728.44] - - [2944, 704, 1, 3328] - - [246, 7149.86] + - [380, 7149.86] - - [128, 448, 1, 256] - - [212, 1699.18] + - [346, 1699.18] - - [512, 32, 1, 512] - - [212, 1127.6] + - [346, 1127.6] - - [3584, 3584, 1, 256] - - [241, 8558.11] + - [375, 8558.11] - - [448, 1408, 1, 128] - - [232, 2504.45] + - [366, 2504.45] - - [128, 256, 1, 1280] - - [213, 3216.59] + - [347, 3216.59] - - [3584, 5056, 1, 256] - - [238, 5674.45] + - [372, 5674.45] - - [6784, 128, 1, 256] - - [240, 6216.49] + - [374, 6216.49] - - [4288, 4, 1, 256] - - [256, 435.706] + - [390, 435.706] - - [64, 1408, 1, 3328] - - [214, 6186.01] + - [348, 6186.01] - - [704, 448, 1, 256] - - [246, 4005.08] + - [380, 4005.08] - - [2944, 2368, 1, 1280] - - [247, 8542.8] + - [381, 8542.8] - - [448, 64, 1, 3328] - - [227, 3835.33] + - [361, 3835.33] - - [1408, 3584, 1, 256] - - [240, 8714.63] + - [374, 8714.63] - - [3584, 4, 1, 3328] - - [195, 689.554] + - [329, 689.554] - - [6784, 3584, 1, 256] - - [245, 9271.34] + - [379, 9271.34] - - [256, 128, 1, 128] - - [178, 283.499] + - [312, 283.499] - - [704, 1408, 1, 128] - - [232, 3210.57] + - [366, 3210.57] - - [4, 2368, 1, 256] - - [258, 360.938] + - [392, 360.938] - - [2944, 448, 1, 128] - - [232, 3344.41] + - [366, 3344.41] - - [128, 1408, 1, 256] - - [240, 3186.38] + - [374, 3186.38] - - [4, 2944, 1, 256] - - [256, 384.622] + - [390, 384.622] - - [64, 128, 1, 3328] - - [209, 2103.72] + - [343, 2103.72] - - [5056, 2368, 1, 128] - - [232, 5219.76] + - [366, 5219.76] - - [2944, 2944, 1, 3328] - - [249, 9174.69] + - [383, 9174.69] - - [5056, 6784, 1, 256] - - [251, 8992.36] + - [385, 8992.36] - - [1856, 3584, 1, 128] - - [232, 4957.27] + - [366, 4957.27] - - [128, 2944, 1, 128] - - [170, 2241.48] + - [304, 2241.48] - - [1024, 704, 1, 3328] - - [250, 6545.11] + - [384, 6545.11] - - [6784, 448, 1, 256] - - [246, 5379.25] + - [380, 5379.25] - - [3584, 6784, 1, 128] - - [232, 5102.01] + - [366, 5102.01] - - [128, 4288, 1, 256] - - [240, 5211.86] + - [374, 5211.86] - - [704, 448, 1, 3328] - - [241, 4504.15] + - [375, 4504.15] - - [1024, 1024, 1, 3328] - - [243, 8009.77] + - [377, 8009.77] - - [128, 128, 1, 3328] - - [226, 3185.03] + - [360, 3185.03] - - [5056, 1856, 1, 256] - - [240, 9138.43] + - [374, 9138.43] - - [256, 128, 1, 256] - - [212, 1205.36] + - [346, 1205.36] - - [1024, 1856, 1, 256] - - [251, 6375.09] + - [385, 6375.09] - - [4288, 64, 1, 128] - - [168, 1695.43] + - [302, 1695.43] - - [256, 448, 1, 3328] - - [215, 5659.67] + - [349, 5659.67] - - [1408, 6784, 1, 1280] - - [240, 9349.2] + - [374, 9349.2] - - [3584, 3584, 1, 1280] - - [245, 9302.19] + - [379, 9302.19] - - [64, 2368, 1, 1280] - - [214, 4433.07] + - [348, 4433.07] - - [448, 2368, 1, 1280] - - [240, 7250.77] + - [374, 7250.77] - - [5888, 5888, 1, 128] - - [232, 4616.03] + - [366, 4616.03] - - [64, 6784, 1, 3328] - - [246, 6987.23] + - [380, 6987.23] - - [2944, 256, 1, 1280] - - [249, 6127.45] + - [383, 6127.45] - - [5056, 5888, 1, 128] - - [231, 5106.39] + - [365, 5106.39] - - [256, 2368, 1, 128] - - [232, 2141.23] + - [366, 2141.23] - - [5056, 2368, 1, 3328] - - [243, 9041.75] + - [377, 9041.75] - - [2944, 4288, 1, 256] - - [251, 8691.22] + - [385, 8691.22] - - [1408, 3584, 1, 1280] - - [240, 9070.0] + - [374, 9070.0] - - [2368, 64, 1, 256] - - [212, 2412.87] + - [346, 2412.87] - - [64, 448, 1, 3328] - - [227, 3739.14] + - [361, 3739.14] - - [256, 256, 1, 3328] - - [214, 5304.18] + - [348, 5304.18] - - [5888, 4, 1, 128] - - [253, 105.655] + - [387, 105.655] - - [1856, 704, 1, 256] - - [240, 8025.43] + - [374, 8025.43] - - [4, 4288, 1, 1280] - - [187, 579.07] + - [321, 579.07] - - [1408, 448, 1, 3328] - - [248, 5714.51] + - [382, 5714.51] - - [1024, 4, 1, 3328] - - [206, 608.649] + - [340, 608.649] - - [2368, 256, 1, 256] - - [246, 5173.08] + - [380, 5173.08] - - [2368, 6784, 1, 3328] - - [246, 9456.61] + - [380, 9456.61] - - [1856, 1408, 1, 1280] - - [251, 7805.19] + - [385, 7805.19] - - [1856, 448, 1, 1280] - - [238, 6185.04] + - [372, 6185.04] - - [6784, 704, 1, 128] - - [232, 4597.87] + - [366, 4597.87] - - [4, 4, 1, 256] - - [209, 0.791892] + - [343, 0.791892] - - [128, 5888, 1, 128] - - [170, 2691.76] + - [304, 2691.76] - - [1408, 5888, 1, 256] - - [245, 7164.27] + - [379, 7164.27] - - [704, 2944, 1, 1280] - - [247, 8139.81] + - [381, 8139.81] - - [1856, 2368, 1, 128] - - [235, 4623.38] + - [369, 4623.38] - - [4096, 7133, 1, 4096] - - [239, 9940.07] + - [373, 9940.07] - - [256, 64, 1, 256] - - [203, 689.953] + - [337, 689.953] - - [1024, 1024, 1, 256] - - [246, 7216.11] + - [380, 7216.11] - - [704, 1856, 1, 256] - - [246, 6364.17] + - [380, 6364.17] - - [128, 4288, 1, 3328] - - [202, 7200.59] + - [336, 7200.59] - - [3584, 704, 1, 1280] - - [249, 7972.08] + - [383, 7972.08] - - [256, 128, 1, 1280] - - [200, 2702.62] + - [334, 2702.62] - - [2368, 4, 1, 256] - - [195, 326.018] + - [329, 326.018] - - [256, 2368, 1, 1280] - - [240, 6638.93] + - [374, 6638.93] - - [2944, 6784, 1, 128] - - [231, 5233.53] + - [365, 5233.53] - - [3584, 448, 1, 3328] - - [240, 8094.4] + - [374, 8094.4] - - [1408, 4, 1, 256] - - [258, 243.646] + - [392, 243.646] - - [704, 2368, 1, 3328] - - [240, 8403.11] + - [374, 8403.11] - - [2944, 448, 1, 256] - - [240, 7022.59] + - [374, 7022.59] - - [1856, 448, 1, 128] - - [235, 2842.79] + - [369, 2842.79] - - [2368, 128, 1, 1280] - - [222, 5685.52] + - [356, 5685.52] - - [256, 5888, 1, 128] - - [237, 2178.71] + - [371, 2178.71] - - [64, 6784, 1, 256] - - [240, 5385.23] + - [374, 5385.23] - - [64, 5056, 1, 1280] - - [214, 5603.29] + - [348, 5603.29] - - [4, 6784, 1, 128] - - [252, 180.256] + - [386, 180.256] - - [2944, 2944, 1, 1280] - - [249, 9129.39] + - [383, 9129.39] - - [5888, 2368, 1, 256] - - [251, 6961.69] + - [385, 6961.69] - - [4, 3584, 1, 1280] - - [195, 646.23] + - [329, 646.23] - - [1408, 128, 1, 128] - - [181, 1172.29] + - [315, 1172.29] - - [6784, 704, 1, 3328] - - [246, 9084.62] + - [380, 9084.62] - - [128, 64, 1, 1280] - - [225, 1260.41] + - [359, 1260.41] - - [2368, 256, 1, 1280] - - [246, 6643.48] + - [380, 6643.48] - - [4, 448, 1, 3328] - - [209, 433.514] + - [343, 433.514] - - [5888, 4288, 1, 128] - - [233, 4753.17] + - [367, 4753.17] - - [4, 5888, 1, 256] - - [195, 471.14] + - [329, 471.14] - - [1408, 2944, 1, 3328] - - [249, 9207.1] + - [383, 9207.1] - - [3584, 704, 1, 128] - - [235, 3762.46] + - [369, 3762.46] - - [64, 1024, 1, 256] - - [213, 1807.99] + - [347, 1807.99] - - [5056, 5056, 1, 128] - - [236, 4830.16] + - [370, 4830.16] - - [2368, 448, 1, 1280] - - [240, 7263.16] + - [374, 7263.16] - - [128, 3584, 1, 256] - - [243, 4369.17] + - [377, 4369.17] - - [704, 448, 1, 1280] - - [241, 4205.33] + - [375, 4205.33] - - [448, 5056, 1, 128] - - [232, 3855.57] + - [366, 3855.57] - - [256, 4, 1, 1280] - - [263, 157.638] + - [397, 157.638] - - [128, 5056, 1, 256] - - [246, 6109.06] + - [380, 6109.06] - - [1408, 5056, 1, 128] - - [235, 4836.68] + - [369, 4836.68] - - [2944, 3584, 1, 128] - - [235, 4532.19] + - [369, 4532.19] - - [3584, 2368, 1, 256] - - [240, 8951.34] + - [374, 8951.34] - - [5888, 5056, 1, 1280] - - [251, 9276.49] + - [385, 9276.49] - - [2368, 5056, 1, 128] - - [235, 5167.66] + - [369, 5167.66] - - [64, 704, 1, 256] - - [195, 1501.97] + - [329, 1501.97] - - [4288, 256, 1, 1280] - - [240, 7496.3] + - [374, 7496.3] - - [3584, 3584, 1, 3328] - - [241, 9301.77] + - [375, 9301.77] - - [1024, 256, 1, 128] - - [232, 1508.84] + - [366, 1508.84] - - [4, 704, 1, 128] - - [253, 12.1469] + - [387, 12.1469] - - [5888, 6784, 1, 256] - - [239, 9370.47] + - [373, 9370.47] - - [4288, 2944, 1, 3328] - - [243, 9149.09] + - [377, 9149.09] - - [2944, 64, 1, 128] - - [179, 1456.46] + - [313, 1456.46] - - [1856, 64, 1, 256] - - [205, 2210.03] + - [339, 2210.03] - - [4288, 128, 1, 3328] - - [199, 6471.95] + - [333, 6471.95] - - [4288, 704, 1, 1280] - - [246, 8934.61] + - [380, 8934.61] - - [256, 5056, 1, 1280] - - [240, 8439.13] + - [374, 8439.13] - - [1408, 256, 1, 128] - - [235, 1769.17] + - [369, 1769.17] - - [2944, 5888, 1, 3328] - - [240, 9448.04] + - [374, 9448.04] - - [6784, 5888, 1, 1280] - - [251, 9372.25] + - [385, 9372.25] - - [704, 128, 1, 256] - - [197, 2059.8] + - [331, 2059.8] - - [5888, 4288, 1, 1280] - - [243, 9244.32] + - [377, 9244.32] - - [448, 256, 1, 1280] - - [222, 4741.72] + - [356, 4741.72] - - [5888, 3584, 1, 128] - - [231, 4980.06] + - [365, 4980.06] - - [1856, 1856, 1, 128] - - [235, 4363.98] + - [369, 4363.98] - - [5056, 4, 1, 1280] - - [255, 629.641] + - [389, 629.641] - - [256, 1408, 1, 1280] - - [246, 5588.44] + - [380, 5588.44] - - [512, 16, 1, 512] - - [206, 689.953] + - [340, 689.953] - - [704, 3584, 1, 128] - - [235, 4069.67] + - [369, 4069.67] - - [5888, 448, 1, 3328] - - [251, 7925.94] + - [385, 7925.94] - - [2368, 4288, 1, 1280] - - [250, 8492.7] + - [384, 8492.7] - - [4288, 2944, 1, 128] - - [232, 5238.21] + - [366, 5238.21] - - [1024, 6784, 1, 3328] - - [246, 8578.18] + - [380, 8578.18] - - [128, 2368, 1, 256] - - [246, 3788.9] + - [380, 3788.9] - - [6784, 64, 1, 3328] - - [240, 7003.46] + - [374, 7003.46] - - [5056, 2944, 1, 3328] - - [243, 8575.45] + - [377, 8575.45] - - [448, 128, 1, 256] - - [195, 1715.06] + - [329, 1715.06] - - [2944, 3584, 1, 256] - - [240, 8994.26] + - [374, 8994.26] - - [1408, 1408, 1, 3328] - - [238, 8757.7] + - [372, 8757.7] - - [1856, 128, 1, 1280] - - [240, 5598.17] + - [374, 5598.17] - - [3584, 3584, 1, 128] - - [231, 4787.44] + - [365, 4787.44] - - [64, 3584, 1, 256] - - [246, 3546.01] + - [380, 3546.01] - - [1408, 4, 1, 3328] - - [190, 640.24] + - [324, 640.24] - - [128, 2944, 1, 3328] - - [214, 7204.24] + - [348, 7204.24] - - [3584, 704, 1, 256] - - [240, 6239.69] + - [374, 6239.69] - - [2944, 448, 1, 3328] - - [246, 7726.71] + - [380, 7726.71] - - [3584, 1408, 1, 3328] - - [238, 9358.78] + - [372, 9358.78] - - [704, 3584, 1, 1280] - - [246, 8005.28] + - [380, 8005.28] - - [2944, 6784, 1, 1280] - - [238, 9487.73] + - [372, 9487.73] - - [1856, 6784, 1, 256] - - [240, 5684.56] + - [374, 5684.56] - - [4288, 448, 1, 3328] - - [246, 8410.38] + - [380, 8410.38] - - [6784, 4288, 1, 128] - - [236, 4785.58] + - [370, 4785.58] - - [6784, 704, 1, 1280] - - [240, 5579.05] + - [374, 5579.05] - - [256, 4288, 1, 256] - - [240, 6781.43] + - [374, 6781.43] - - [3584, 64, 1, 128] - - [179, 1474.0] + - [313, 1474.0] - - [5888, 1024, 1, 3328] - - [238, 8639.49] + - [372, 8639.49] - - [448, 64, 1, 128] - - [170, 259.282] + - [304, 259.282] - - [704, 6784, 1, 1280] - - [246, 9027.25] + - [380, 9027.25] - - [5888, 128, 1, 256] - - [246, 6812.88] + - [380, 6812.88] - - [2368, 448, 1, 3328] - - [246, 7356.63] + - [380, 7356.63] - - [1856, 5056, 1, 3328] - - [245, 8871.56] + - [379, 8871.56] - - [4, 6784, 1, 256] - - [254, 469.479] + - [388, 469.479] - - [1024, 3584, 1, 128] - - [232, 3428.02] + - [366, 3428.02] - - [1024, 1408, 1, 128] - - [235, 2935.05] + - [369, 2935.05] - - [2368, 2944, 1, 128] - - [235, 4888.02] + - [369, 4888.02] - - [5056, 64, 1, 256] - - [204, 3186.16] + - [338, 3186.16] - - [4, 448, 1, 1280] - - [209, 273.167] + - [343, 273.167] - - [5056, 2944, 1, 128] - - [236, 4752.79] + - [370, 4752.79] - - [5888, 5056, 1, 3328] - - [250, 9124.77] + - [384, 9124.77] - - [1024, 704, 1, 128] - - [235, 2302.36] + - [369, 2302.36] - - [1408, 2368, 1, 128] - - [235, 3826.95] + - [369, 3826.95] - - [5888, 2368, 1, 128] - - [232, 4912.77] + - [366, 4912.77] - - [128, 5056, 1, 3328] - - [222, 7583.8] + - [356, 7583.8] - - [3584, 6784, 1, 1280] - - [249, 9313.5] + - [383, 9313.5] - - [3072, 7435, 1, 1024] - - [243, 9322.07] + - [377, 9322.07] - - [1856, 5888, 1, 256] - - [240, 5778.34] + - [374, 5778.34] - - [256, 256, 1, 256] - - [192, 1576.91] + - [326, 1576.91] - - [256, 64, 1, 128] - - [178, 173.705] + - [312, 173.705] - - [4288, 4288, 1, 3328] - - [245, 8416.27] + - [379, 8416.27] - - [4288, 1408, 1, 1280] - - [251, 9301.97] + - [385, 9301.97] - - [3584, 5056, 1, 128] - - [237, 4344.94] + - [371, 4344.94] - - [4, 1024, 1, 3328] - - [206, 615.239] + - [340, 615.239] - - [4288, 2368, 1, 256] - - [240, 9142.67] + - [374, 9142.67] - - [2944, 5056, 1, 1280] - - [240, 9399.69] + - [374, 9399.69] - - [448, 6784, 1, 256] - - [239, 5710.93] + - [373, 5710.93] - - [64, 1024, 1, 3328] - - [222, 4975.1] + - [356, 4975.1] - - [6784, 2368, 1, 3328] - - [249, 9207.63] + - [383, 9207.63] - - [256, 1024, 1, 1280] - - [246, 5983.42] + - [380, 5983.42] - - [704, 4, 1, 128] - - [252, 15.1187] + - [386, 15.1187] - - [256, 4, 1, 256] - - [209, 52.9516] + - [343, 52.9516] - - [4288, 128, 1, 256] - - [240, 5242.98] + - [374, 5242.98] - - [4288, 1856, 1, 3328] - - [251, 9354.06] + - [385, 9354.06] - - [3584, 448, 1, 128] - - [232, 3353.9] + - [366, 3353.9] - - [256, 4, 1, 3328] - - [263, 313.324] + - [397, 313.324] - - [4, 1408, 1, 1280] - - [206, 509.207] + - [340, 509.207] - - [3584, 64, 1, 1280] - - [194, 5198.42] + - [328, 5198.42] - - [1408, 448, 1, 128] - - [232, 2628.37] + - [366, 2628.37] - - [3584, 1024, 1, 1280] - - [246, 8535.01] + - [380, 8535.01] - - [1856, 5056, 1, 256] - - [238, 8184.49] + - [372, 8184.49] - - [4, 3584, 1, 256] - - [256, 395.576] + - [390, 395.576] - - [1024, 4288, 1, 256] - - [241, 5966.52] + - [375, 5966.52] - - [5888, 3584, 1, 3328] - - [244, 9189.43] + - [378, 9189.43] - - [4, 256, 1, 256] - - [260, 41.5785] + - [394, 41.5785] - - [5056, 3584, 1, 3328] - - [245, 9431.92] + - [379, 9431.92] - - [128, 5888, 1, 1280] - - [240, 8192.1] + - [374, 8192.1] - - [704, 448, 1, 128] - - [232, 1510.96] + - [366, 1510.96] - - [2368, 1408, 1, 1280] - - [240, 8415.65] + - [374, 8415.65] - - [5056, 2944, 1, 1280] - - [251, 9294.77] + - [385, 9294.77] - - [4, 4, 1, 128] - - [253, 0.1356549] + - [387, 0.1356549] - - [3584, 256, 1, 256] - - [240, 6749.55] + - [374, 6749.55] - - [128, 1856, 1, 3328] - - [193, 6797.09] + - [327, 6797.09] - - [1024, 6784, 1, 256] - - [246, 8783.09] + - [380, 8783.09] - - [4, 128, 1, 256] - - [206, 27.4067] + - [340, 27.4067] - - [64, 64, 1, 1280] - - [225, 712.448] + - [359, 712.448] - - [6784, 4, 1, 128] - - [253, 122.06] + - [387, 122.06] - - [2944, 1408, 1, 128] - - [235, 4430.46] + - [369, 4430.46] - - [448, 128, 1, 3328] - - [222, 5097.34] + - [356, 5097.34] - - [64, 2944, 1, 3328] - - [222, 6362.2] + - [356, 6362.2] - - [64, 4288, 1, 3328] - - [222, 6565.01] + - [356, 6565.01] - - [5056, 6784, 1, 3328] - - [246, 8121.18] + - [380, 8121.18] - - [128, 2944, 1, 256] - - [240, 4692.17] + - [374, 4692.17] - - [128, 6784, 1, 128] - - [169, 2687.46] + - [303, 2687.46] - - [3584, 4288, 1, 256] - - [246, 9193.99] + - [380, 9193.99] - - [448, 1856, 1, 256] - - [246, 6231.39] + - [380, 6231.39] - - [1856, 6784, 1, 3328] - - [251, 9191.48] + - [385, 9191.48] - - [3584, 128, 1, 3328] - - [240, 7368.47] + - [374, 7368.47] - - [64, 1856, 1, 256] - - [191, 2184.63] + - [325, 2184.63] - - [1024, 448, 1, 1280] - - [246, 6977.32] + - [380, 6977.32] - - [5888, 4288, 1, 256] - - [246, 5780.5] + - [380, 5780.5] - - [4, 448, 1, 128] - - [253, 9.06] + - [387, 9.06] - - [5056, 1408, 1, 256] - - [240, 5601.35] + - [374, 5601.35] - - [64, 256, 1, 1280] - - [206, 1927.63] + - [340, 1927.63] - - [3584, 1024, 1, 256] - - [251, 7542.84] + - [385, 7542.84] - - [256, 704, 1, 256] - - [240, 2957.62] + - [374, 2957.62] - - [5888, 5888, 1, 256] - - [251, 7344.14] + - [385, 7344.14] - - [4288, 1024, 1, 1280] - - [246, 8925.84] + - [380, 8925.84] - - [5888, 128, 1, 3328] - - [240, 8410.07] + - [374, 8410.07] - - [448, 6784, 1, 3328] - - [240, 8862.56] + - [374, 8862.56] - - [2944, 1408, 1, 1280] - - [251, 7478.93] + - [385, 7478.93] - - [1024, 32, 1, 512] - - [195, 1777.35] + - [329, 1777.35] - - [2944, 1856, 1, 3328] - - [240, 9153.43] + - [374, 9153.43] - - [2368, 64, 1, 128] - - [179, 1102.3] + - [313, 1102.3] - - [2944, 2944, 1, 128] - - [231, 4591.95] + - [365, 4591.95] - - [4, 128, 1, 3328] - - [261, 119.09] + - [395, 119.09] - - [3584, 5888, 1, 1280] - - [240, 9222.49] + - [374, 9222.49] - - [64, 4, 1, 128] - - [252, 1.03516] + - [386, 1.03516] - - [6784, 1856, 1, 1280] - - [240, 9136.07] + - [374, 9136.07] - - [2944, 5056, 1, 256] - - [246, 8860.13] + - [380, 8860.13] - - [2944, 5888, 1, 1280] - - [239, 9643.63] + - [373, 9643.63] - - [5888, 256, 1, 3328] - - [246, 8799.53] + - [380, 8799.53] - - [1856, 5888, 1, 3328] - - [246, 9457.53] + - [380, 9457.53] - - [3584, 1408, 1, 256] - - [246, 8672.53] + - [380, 8672.53] - - [704, 3584, 1, 3328] - - [246, 8525.3] + - [380, 8525.3] - - [5056, 448, 1, 1280] - - [246, 8843.77] + - [380, 8843.77] - - [3584, 1856, 1, 3328] - - [238, 8881.53] + - [372, 8881.53] - - [64, 1408, 1, 128] - - [167, 747.142] + - [301, 747.142] - - [1408, 704, 1, 1280] - - [240, 8342.93] + - [374, 8342.93] - - [2944, 1024, 1, 256] - - [251, 8079.58] + - [385, 8079.58] - - [1024, 2368, 1, 128] - - [235, 3347.58] + - [369, 3347.58] - - [2368, 4288, 1, 3328] - - [246, 9467.67] + - [380, 9467.67] - - [4, 1408, 1, 256] - - [258, 257.563] + - [392, 257.563] - - [1024, 1408, 1, 1280] - - [246, 8241.84] + - [380, 8241.84] - - [64, 64, 1, 256] - - [206, 190.059] + - [340, 190.059] - - [704, 256, 1, 3328] - - [240, 4519.28] + - [374, 4519.28] - - [6784, 5056, 1, 256] - - [239, 9133.78] + - [373, 9133.78] - - [4, 4288, 1, 3328] - - [190, 670.075] + - [324, 670.075] - - [448, 6784, 1, 128] - - [232, 4481.92] + - [366, 4481.92] - - [4, 704, 1, 3328] - - [262, 523.071] + - [396, 523.071] - - [448, 2944, 1, 256] - - [240, 7022.59] + - [374, 7022.59] - - [2944, 6784, 1, 256] - - [246, 9199.84] + - [380, 9199.84] - - [2368, 2368, 1, 1280] - - [251, 8646.84] + - [385, 8646.84] - - [4, 4, 1, 1280] - - [209, 3.11176] + - [343, 3.11176] - - [1856, 3584, 1, 1280] - - [238, 8805.45] + - [372, 8805.45] - - [64, 2944, 1, 256] - - [212, 2565.76] + - [346, 2565.76] - - [3584, 1408, 1, 1280] - - [251, 9273.12] + - [385, 9273.12] - - [448, 256, 1, 128] - - [167, 941.13] + - [301, 941.13] - - [4288, 448, 1, 128] - - [233, 3215.2] + - [367, 3215.2] - - [5056, 256, 1, 1280] - - [246, 8790.13] + - [380, 8790.13] - - [1856, 1408, 1, 3328] - - [240, 9310.73] + - [374, 9310.73] - - [128, 128, 1, 128] - - [175, 155.215] + - [309, 155.215] - - [1024, 4288, 1, 3328] - - [243, 8528.12] + - [377, 8528.12] - - [448, 2368, 1, 256] - - [247, 5097.34] + - [381, 5097.34] - - [1024, 4, 1, 128] - - [253, 10.3721] + - [387, 10.3721] - - [5056, 448, 1, 256] - - [246, 8236.78] + - [380, 8236.78] - - [2944, 2368, 1, 3328] - - [239, 9331.16] + - [373, 9331.16] - - [704, 128, 1, 3328] - - [214, 5969.3] + - [348, 5969.3] - - [64, 64, 1, 3328] - - [230, 1494.78] + - [364, 1494.78] - - [1024, 1856, 1, 1280] - - [245, 6356.43] + - [379, 6356.43] - - [6784, 1856, 1, 256] - - [246, 9068.63] + - [380, 9068.63] - - [128, 2368, 1, 3328] - - [222, 6714.22] + - [356, 6714.22] - - [1024, 5888, 1, 256] - - [246, 5501.6] + - [380, 5501.6] - - [5056, 128, 1, 1280] - - [202, 6455.64] + - [336, 6455.64] - - [5056, 64, 1, 3328] - - [207, 6703.81] + - [341, 6703.81] - - [128, 704, 1, 128] - - [168, 696.618] + - [302, 696.618] - - [1408, 2368, 1, 256] - - [240, 8667.25] + - [374, 8667.25] - - [1408, 1408, 1, 256] - - [251, 7615.81] + - [385, 7615.81] - - [4, 64, 1, 128] - - [253, 1.08463] + - [387, 1.08463] - - [64, 128, 1, 1280] - - [225, 1379.81] + - [359, 1379.81] - - [2368, 2368, 1, 128] - - [235, 4582.26] + - [369, 4582.26] - - [64, 5888, 1, 128] - - [168, 2086.37] + - [302, 2086.37] - - [5888, 4, 1, 3328] - - [189, 667.514] + - [323, 667.514] - - [6784, 1408, 1, 128] - - [236, 4516.34] + - [370, 4516.34] - - [4288, 5888, 1, 256] - - [251, 8497.43] + - [385, 8497.43] - - [1408, 5056, 1, 256] - - [240, 8867.46] + - [374, 8867.46] - - [5056, 128, 1, 3328] - - [222, 7678.98] + - [356, 7678.98] - - [128, 128, 1, 1280] - - [210, 2016.59] + - [344, 2016.59] - - [448, 704, 1, 256] - - [241, 3030.89] + - [375, 3030.89] - - [4288, 3584, 1, 128] - - [232, 5246.33] + - [366, 5246.33] - - [2944, 128, 1, 3328] - - [207, 6795.16] + - [341, 6795.16] - - [128, 5056, 1, 1280] - - [193, 6193.09] + - [327, 6193.09] - - [3584, 5056, 1, 1280] - - [245, 9499.17] + - [379, 9499.17] - - [256, 448, 1, 1280] - - [201, 4267.56] + - [335, 4267.56] - - [704, 704, 1, 128] - - [235, 2259.32] + - [369, 2259.32] - - [5056, 4, 1, 128] - - [253, 12.5313] + - [387, 12.5313] - - [704, 256, 1, 1280] - - [240, 4355.97] + - [374, 4355.97] - - [64, 2368, 1, 3328] - - [214, 6310.97] + - [348, 6310.97] - - [1856, 1024, 1, 128] - - [231, 4065.43] + - [365, 4065.43] - - [1856, 64, 1, 128] - - [170, 936.329] + - [304, 936.329] - - [64, 6784, 1, 1280] - - [193, 5731.8] + - [327, 5731.8] - - [704, 4288, 1, 256] - - [246, 5218.9] + - [380, 5218.9] - - [5888, 2368, 1, 1280] - - [240, 9378.9] + - [374, 9378.9] - - [128, 256, 1, 256] - - [210, 1219.37] + - [344, 1219.37] - - [256, 64, 1, 1280] - - [212, 1820.54] + - [346, 1820.54] - - [2368, 5888, 1, 1280] - - [251, 9143.64] + - [385, 9143.64] - - [5888, 256, 1, 1280] - - [240, 8678.47] + - [374, 8678.47] - - [4, 5888, 1, 1280] - - [187, 668.242] + - [321, 668.242] - - [704, 128, 1, 128] - - [175, 649.556] + - [309, 649.556] - - [1024, 4, 1, 1280] - - [206, 478.465] + - [340, 478.465] - - [2368, 1856, 1, 3328] - - [238, 8153.87] + - [372, 8153.87] - - [2368, 128, 1, 128] - - [173, 1858.21] + - [307, 1858.21] - - [2944, 704, 1, 256] - - [240, 8438.07] + - [374, 8438.07] - - [5056, 128, 1, 128] - - [169, 2689.63] + - [303, 2689.63] - - [256, 704, 1, 3328] - - [240, 4541.18] + - [374, 4541.18] - - [704, 3584, 1, 256] - - [241, 7771.07] + - [375, 7771.07] - - [1024, 1024, 1, 1024] - - [246, 8305.62] + - [380, 8305.62] - - [704, 2944, 1, 3328] - - [246, 9166.48] + - [380, 9166.48] - - [6784, 1024, 1, 128] - - [231, 4362.31] + - [365, 4362.31] - - [256, 448, 1, 128] - - [178, 899.614] + - [312, 899.614] - - [448, 1024, 1, 3328] - - [240, 7385.56] + - [374, 7385.56] - - [2944, 1024, 1, 3328] - - [243, 8779.81] + - [377, 8779.81] - - [2944, 5056, 1, 128] - - [235, 5103.11] + - [369, 5103.11] - - [1408, 6784, 1, 256] - - [246, 8346.89] + - [380, 8346.89] - - [6784, 1408, 1, 3328] - - [242, 8878.4] + - [376, 8878.4] - - [4288, 6784, 1, 128] - - [231, 5432.99] + - [365, 5432.99] - - [704, 64, 1, 256] - - [220, 1441.89] + - [354, 1441.89] - - [5888, 4, 1, 1280] - - [257, 636.641] + - [391, 636.641] - - [256, 2368, 1, 3328] - - [240, 6804.8] + - [374, 6804.8] - - [6784, 2944, 1, 1280] - - [239, 9472.26] + - [373, 9472.26] - - [4288, 1856, 1, 128] - - [235, 4886.38] + - [369, 4886.38] - - [1856, 2944, 1, 128] - - [232, 4642.96] + - [366, 4642.96] - - [6784, 448, 1, 128] - - [232, 4369.17] + - [366, 4369.17] - - [64, 3584, 1, 128] - - [179, 1645.85] + - [313, 1645.85] - - [448, 5056, 1, 1280] - - [240, 8553.64] + - [374, 8553.64] - - [2368, 1856, 1, 128] - - [232, 4741.85] + - [366, 4741.85] - - [128, 448, 1, 1280] - - [222, 3745.01] + - [356, 3745.01] - - [4288, 704, 1, 256] - - [240, 8444.16] + - [374, 8444.16] - - [256, 3584, 1, 128] - - [232, 2454.96] + - [366, 2454.96] - - [5888, 704, 1, 256] - - [240, 8819.57] + - [374, 8819.57] - - [3584, 1024, 1, 128] - - [235, 4094.96] + - [369, 4094.96] - - [256, 5888, 1, 3328] - - [249, 8538.33] + - [383, 8538.33] - - [1408, 4288, 1, 3328] - - [251, 9212.57] + - [385, 9212.57] - - [6784, 4288, 1, 256] - - [239, 9163.12] + - [373, 9163.12] - - [4288, 256, 1, 128] - - [232, 3081.44] + - [366, 3081.44] - - [5888, 256, 1, 256] - - [240, 7680.75] + - [374, 7680.75] - - [6784, 1024, 1, 1280] - - [251, 9248.63] + - [385, 9248.63] - - [5888, 1024, 1, 128] - - [235, 4061.94] + - [369, 4061.94] - - [1024, 128, 1, 256] - - [246, 2317.39] + - [380, 2317.39] - - [128, 64, 1, 3328] - - [229, 2116.79] + - [363, 2116.79] - - [448, 64, 1, 256] - - [212, 1079.52] + - [346, 1079.52] - - [2368, 256, 1, 128] - - [233, 2229.83] + - [367, 2229.83] - - [6784, 3584, 1, 1280] - - [246, 9096.6] + - [380, 9096.6] - - [1024, 6784, 1, 1280] - - [244, 9112.9] + - [378, 9112.9] - - [2944, 64, 1, 1280] - - [202, 4983.0] + - [336, 4983.0] - - [1408, 2944, 1, 1280] - - [241, 9131.63] + - [375, 9131.63] - - [256, 1856, 1, 256] - - [249, 4432.86] + - [383, 4432.86] - - [1408, 2368, 1, 3328] - - [249, 8449.18] + - [383, 8449.18] - - [2944, 4, 1, 3328] - - [195, 673.94] + - [329, 673.94] - - [128, 1408, 1, 3328] - - [214, 6582.47] + - [348, 6582.47] - - [2944, 1856, 1, 128] - - [232, 4827.54] + - [366, 4827.54] - - [256, 2944, 1, 128] - - [235, 2416.66] + - [369, 2416.66] - - [256, 6784, 1, 128] - - [235, 3118.76] + - [369, 3118.76] - - [2368, 4, 1, 128] - - [253, 22.7197] + - [387, 22.7197] - - [1408, 256, 1, 3328] - - [240, 3733.82] + - [374, 3733.82] - - [1856, 4, 1, 128] - - [252, 7.20009] + - [386, 7.20009] - - [1024, 16, 1, 512] - - [208, 1165.18] + - [342, 1165.18] - - [5056, 6784, 1, 128] - - [236, 4949.13] + - [370, 4949.13] - - [4288, 5056, 1, 128] - - [235, 4966.9] + - [369, 4966.9] - - [1856, 5888, 1, 128] - - [231, 4351.76] + - [365, 4351.76] - - [2944, 5888, 1, 256] - - [251, 8460.99] + - [385, 8460.99] - - [3584, 1856, 1, 256] - - [246, 8876.7] + - [380, 8876.7] - - [4288, 3584, 1, 1280] - - [239, 9603.7] + - [373, 9603.7] - - [2368, 448, 1, 256] - - [240, 6604.7] + - [374, 6604.7] - - [4288, 256, 1, 3328] - - [240, 7619.89] + - [374, 7619.89] - - [1856, 704, 1, 128] - - [232, 3629.61] + - [366, 3629.61] - - [1408, 64, 1, 256] - - [196, 2168.21] + - [330, 2168.21] - - [64, 1856, 1, 128] - - [172, 979.762] + - [306, 979.762] - - [4, 256, 1, 128] - - [253, 5.23595] + - [387, 5.23595] - - [704, 4288, 1, 3328] - - [246, 9014.52] + - [380, 9014.52] - - [704, 5888, 1, 128] - - [233, 4221.77] + - [367, 4221.77] - - [6784, 3584, 1, 128] - - [231, 5360.73] + - [365, 5360.73] - - [1024, 64, 1, 256] - - [191, 1588.85] + - [325, 1588.85] - - [64, 2368, 1, 256] - - [246, 2552.55] + - [380, 2552.55] - - [4288, 5056, 1, 3328] - - [245, 8193.38] + - [379, 8193.38] - - [4, 1856, 1, 1280] - - [195, 499.192] + - [329, 499.192] - - [4288, 128, 1, 128] - - [232, 2373.57] + - [366, 2373.57] - - [1408, 1408, 1, 128] - - [235, 3753.88] + - [369, 3753.88] - - [1024, 128, 1, 3328] - - [217, 5656.32] + - [351, 5656.32] - - [1856, 128, 1, 128] - - [168, 1617.58] + - [302, 1617.58] - - [5056, 2368, 1, 256] - - [251, 5553.41] + - [385, 5553.41] - - [4288, 704, 1, 3328] - - [239, 6962.06] + - [373, 6962.06] - - [448, 3584, 1, 256] - - [249, 5981.5] + - [383, 5981.5] - - [64, 128, 1, 128] - - [186, 74.9983] + - [320, 74.9983] - - [2368, 64, 1, 1280] - - [222, 5041.33] + - [356, 5041.33] - - [2368, 1024, 1, 1280] - - [247, 7740.97] + - [381, 7740.97] - - [2944, 1408, 1, 3328] - - [249, 9204.65] + - [383, 9204.65] - - [1408, 448, 1, 256] - - [246, 5954.4] + - [380, 5954.4] - - [1024, 1408, 1, 3328] - - [243, 8161.54] + - [377, 8161.54] - - [2560, 7133, 1, 2560] - - [238, 9636.69] + - [372, 9636.69] - - [1408, 4, 1, 1280] - - [190, 520.979] + - [324, 520.979] - - [5888, 3584, 1, 256] - - [251, 9225.26] + - [385, 9225.26] - - [128, 1024, 1, 1280] - - [193, 4755.55] + - [327, 4755.55] - - [1408, 1856, 1, 3328] - - [243, 9130.87] + - [377, 9130.87] - - [4, 4, 1, 3328] - - [263, 7.03333] + - [397, 7.03333] - - [6784, 1408, 1, 1280] - - [240, 9346.91] + - [374, 9346.91] - - [4, 1024, 1, 1280] - - [190, 422.913] + - [324, 422.913] - - [704, 2944, 1, 256] - - [246, 8332.06] + - [380, 8332.06] - - [704, 4288, 1, 128] - - [232, 4371.14] + - [366, 4371.14] - - [2368, 4288, 1, 128] - - [232, 3988.89] + - [366, 3988.89] - - [64, 4288, 1, 1280] - - [222, 5407.63] + - [356, 5407.63] - - [6784, 64, 1, 1280] - - [202, 5708.25] + - [336, 5708.25] - - [3584, 128, 1, 128] - - [168, 2463.2] + - [302, 2463.2] - - [1024, 6784, 1, 128] - - [233, 3862.12] + - [367, 3862.12] - - [4, 1856, 1, 128] - - [253, 30.6362] + - [387, 30.6362] - - [1408, 64, 1, 3328] - - [222, 6095.48] + - [356, 6095.48] - - [6784, 4, 1, 256] - - [255, 487.938] + - [389, 487.938] - - [1408, 1408, 1, 1280] - - [251, 8640.63] + - [385, 8640.63] - - [256, 2368, 1, 256] - - [243, 4282.36] + - [377, 4282.36] - - [448, 4288, 1, 3328] - - [240, 8516.13] + - [374, 8516.13] - - [2368, 1408, 1, 256] - - [246, 8632.19] + - [380, 8632.19] - - [5888, 5056, 1, 128] - - [232, 5091.11] + - [366, 5091.11] - - [704, 2368, 1, 256] - - [246, 7664.8] + - [380, 7664.8] - - [2944, 448, 1, 1280] - - [246, 7618.35] + - [380, 7618.35] - - [5888, 2368, 1, 3328] - - [249, 9343.48] + - [383, 9343.48] - - [64, 2944, 1, 1280] - - [214, 5162.18] + - [348, 5162.18] - - [448, 1856, 1, 1280] - - [240, 7028.0] + - [374, 7028.0] - - [4288, 448, 1, 1280] - - [240, 5855.76] + - [374, 5855.76] - - [5888, 704, 1, 3328] - - [249, 9190.91] + - [383, 9190.91] - - [5056, 256, 1, 128] - - [235, 3235.94] + - [369, 3235.94] - - [1856, 256, 1, 128] - - [233, 1849.78] + - [367, 1849.78] - - [5056, 128, 1, 256] - - [246, 6109.06] + - [380, 6109.06] - - [704, 4, 1, 256] - - [206, 125.256] + - [340, 125.256] - - [1408, 5888, 1, 128] - - [232, 5055.16] + - [366, 5055.16] - - [4288, 4, 1, 128] - - [252, 95.7209] + - [386, 95.7209] - - [1408, 1024, 1, 256] - - [240, 7370.28] + - [374, 7370.28] - - [1024, 1856, 1, 128] - - [232, 2966.8] + - [366, 2966.8] - - [256, 704, 1, 128] - - [234, 528.229] + - [368, 528.229] - - [256, 1024, 1, 128] - - [232, 1171.69] + - [366, 1171.69] - - [448, 1024, 1, 256] - - [246, 5624.65] + - [380, 5624.65] - - [128, 4, 1, 3328] - - [263, 191.985] + - [397, 191.985] - - [5056, 6784, 1, 1280] - - [240, 9544.07] + - [374, 9544.07] - - [704, 5056, 1, 3328] - - [247, 8790.35] + - [381, 8790.35] - - [64, 1408, 1, 1280] - - [214, 4505.7] + - [348, 4505.7] - - [3584, 5056, 1, 3328] - - [245, 9073.52] + - [379, 9073.52] - - [1856, 4, 1, 3328] - - [263, 612.875] + - [397, 612.875] - - [4, 2944, 1, 128] - - [252, 72.0145] + - [386, 72.0145] - - [2368, 2944, 1, 3328] - - [238, 9314.68] + - [372, 9314.68] - - [448, 448, 1, 1280] - - [222, 5129.91] + - [356, 5129.91] - - [2368, 3584, 1, 256] - - [240, 8998.8] - - - [1024, 256, 1, 1280] - - [247, 3566.68] + - [374, 8998.8] - - [5056, 3584, 1, 1280] - - [241, 9345.17] + - [375, 9345.17] - - [448, 4, 1, 3328] - - [263, 487.337] + - [397, 487.337] - - [1856, 2944, 1, 1280] - - [251, 8438.79] + - [385, 8438.79] - - [3584, 2368, 1, 1280] - - [246, 9298.9] + - [380, 9298.9] - - [128, 1024, 1, 256] - - [198, 2356.45] + - [332, 2356.45] - - [2944, 1408, 1, 256] - - [238, 5440.82] + - [372, 5440.82] - - [4288, 1408, 1, 3328] - - [238, 9386.09] + - [372, 9386.09] - - [3584, 64, 1, 3328] - - [194, 6310.97] + - [328, 6310.97] - - [1408, 128, 1, 256] - - [240, 2942.53] + - [374, 2942.53] - - [2944, 1024, 1, 128] - - [235, 3927.99] + - [369, 3927.99] - - [4288, 5056, 1, 1280] - - [242, 8328.58] + - [376, 8328.58] - - [5888, 6784, 1, 1280] - - [251, 9757.44] + - [385, 9757.44] - - [6784, 5056, 1, 128] - - [231, 5101.4] + - [365, 5101.4] - - [256, 1024, 1, 3328] - - [240, 6475.87] + - [374, 6475.87] - - [3584, 4, 1, 256] - - [256, 420.973] + - [390, 420.973] - - [1856, 64, 1, 3328] - - [222, 6409.2] + - [356, 6409.2] - - [64, 6784, 1, 128] - - [170, 2387.32] + - [304, 2387.32] - - [5888, 1408, 1, 3328] - - [245, 9655.89] + - [379, 9655.89] - - [5888, 64, 1, 1280] - - [240, 5870.86] + - [374, 5870.86] - - [256, 5056, 1, 256] - - [243, 6109.06] + - [377, 6109.06] - - [128, 3584, 1, 128] - - [173, 2383.23] + - [307, 2383.23] - - [448, 3584, 1, 3328] - - [238, 7092.28] + - [372, 7092.28] - - [704, 2368, 1, 128] - - [232, 3741.08] + - [366, 3741.08] - - [5888, 256, 1, 128] - - [233, 2977.54] + - [367, 2977.54] - - [4, 5056, 1, 128] - - [252, 132.72] + - [386, 132.72] - - [448, 256, 1, 256] - - [204, 2308.29] + - [338, 2308.29] - - [704, 4, 1, 3328] - - [209, 552.674] + - [343, 552.674] - - [1408, 256, 1, 256] - - [240, 4577.22] + - [374, 4577.22] - - [3584, 1856, 1, 128] - - [232, 4571.86] + - [366, 4571.86] - - [4288, 4288, 1, 128] - - [235, 5284.65] + - [369, 5284.65] - - [1856, 1024, 1, 3328] - - [246, 6362.25] + - [380, 6362.25] - - [128, 5888, 1, 3328] - - [216, 7040.83] + - [350, 7040.83] - - [1024, 5056, 1, 256] - - [251, 7855.7] + - [385, 7855.7] - - [2368, 1408, 1, 3328] - - [246, 9205.66] + - [380, 9205.66] - - [5888, 448, 1, 256] - - [243, 5538.84] + - [377, 5538.84] - - [5888, 6784, 1, 128] - - [231, 4500.85] + - [365, 4500.85] - - [2368, 4, 1, 3328] - - [209, 642.898] + - [343, 642.898] - - [6784, 5056, 1, 1280] - - [247, 9249.23] + - [381, 9249.23] - - [5056, 704, 1, 1280] - - [246, 8883.37] + - [380, 8883.37] - - [1408, 256, 1, 1280] - - [240, 5632.1] + - [374, 5632.1] - - [4288, 6784, 1, 1280] - - [246, 8843.31] + - [380, 8843.31] - - [128, 704, 1, 256] - - [204, 2045.19] + - [338, 2045.19] - - [448, 128, 1, 1280] - - [214, 3807.17] + - [348, 3807.17] - - [6784, 4, 1, 3328] - - [257, 684.671] + - [391, 684.671] - - [4288, 4, 1, 1280] - - [206, 601.925] + - [340, 601.925] - - [1024, 64, 1, 3328] - - [218, 3928.48] + - [352, 3928.48] - - [1856, 4, 1, 256] - - [256, 293.394] + - [390, 293.394] - - [64, 3584, 1, 1280] - - [240, 5265.55] + - [374, 5265.55] - - [6784, 1408, 1, 256] - - [240, 9059.36] + - [374, 9059.36] - - [3584, 5888, 1, 128] - - [232, 5084.29] + - [366, 5084.29] - - [5056, 5888, 1, 256] - - [251, 8590.09] + - [385, 8590.09] - - [2368, 1024, 1, 256] - - [243, 4493.13] + - [377, 4493.13] - - [2944, 1856, 1, 256] - - [249, 5202.41] + - [383, 5202.41] - - [1856, 6784, 1, 1280] - - [247, 9071.48] + - [381, 9071.48] - - [64, 5056, 1, 128] - - [170, 2038.42] + - [304, 2038.42] - - [5888, 64, 1, 128] - - [169, 2016.59] + - [303, 2016.59] - - [448, 704, 1, 128] - - [233, 1173.65] + - [367, 1173.65] - - [4, 1024, 1, 128] - - [252, 8.89685] + - [386, 8.89685] - - [4288, 3584, 1, 256] - - [246, 9080.26] + - [380, 9080.26] - - [1408, 704, 1, 128] - - [232, 3165.71] + - [366, 3165.71] - - [64, 256, 1, 3328] - - [226, 3126.59] + - [360, 3126.59] - - [5056, 1856, 1, 1280] - - [243, 8857.55] + - [377, 8857.55] - - [1408, 1024, 1, 3328] - - [249, 8177.12] + - [383, 8177.12] - - [2368, 256, 1, 3328] - - [240, 6810.31] + - [374, 6810.31] - - [5888, 3584, 1, 1280] - - [238, 9535.55] + - [372, 9535.55] - - [1856, 3584, 1, 3328] - - [240, 9281.91] + - [374, 9281.91] - - [5888, 128, 1, 1280] - - [246, 8136.82] + - [380, 8136.82] - - [1024, 2944, 1, 256] - - [238, 7247.96] + - [372, 7247.96] - - [448, 6784, 1, 1280] - - [246, 7014.04] + - [380, 7014.04] - - [256, 3584, 1, 1280] - - [240, 7738.64] + - [374, 7738.64] - - [448, 128, 1, 128] - - [170, 496.048] + - [304, 496.048] - - [704, 5056, 1, 256] - - [246, 8609.44] + - [380, 8609.44] - - [3584, 1024, 1, 3328] - - [239, 7765.73] + - [373, 7765.73] - - [2944, 1856, 1, 1280] - - [251, 7776.03] + - [385, 7776.03] - - [128, 256, 1, 128] - - [183, 296.308] + - [317, 296.308] - - [5056, 256, 1, 256] - - [240, 7829.73] + - [374, 7829.73] - - [2368, 3584, 1, 3328] - - [239, 8896.08] + - [373, 8896.08] - - [2944, 704, 1, 1280] - - [249, 6855.83] + - [383, 6855.83] - - [128, 4, 1, 256] - - [258, 24.9242] + - [392, 24.9242] - - [2944, 3584, 1, 1280] - - [251, 9049.22] + - [385, 9049.22] - - [1856, 5888, 1, 1280] - - [246, 9432.06] + - [380, 9432.06] - - [256, 256, 1, 1280] - - [211, 3942.12] + - [345, 3942.12] - - [5056, 448, 1, 3328] - - [251, 4587.83] + - [385, 4587.83] - - [4288, 1408, 1, 256] - - [251, 5408.83] + - [385, 5408.83] - - [3584, 64, 1, 256] - - [220, 2496.71] + - [354, 2496.71] - - [64, 1856, 1, 3328] - - [193, 5896.78] + - [327, 5896.78] - - [256, 1408, 1, 128] - - [232, 1643.17] + - [366, 1643.17] - - [5888, 1408, 1, 128] - - [231, 4436.37] + - [365, 4436.37] - - [4288, 2368, 1, 1280] - - [240, 9433.04] + - [374, 9433.04] - - [4, 4288, 1, 256] - - [255, 442.732] + - [389, 442.732] - - [256, 4288, 1, 128] - - [232, 2814.79] + - [366, 2814.79] - - [256, 128, 1, 3328] - - [221, 3951.26] + - [355, 3951.26] - - [6784, 2368, 1, 256] - - [240, 9169.99] + - [374, 9169.99] - - [5888, 128, 1, 128] - - [169, 3156.81] + - [303, 3156.81] - - [4288, 1856, 1, 256] - - [246, 5658.23] + - [380, 5658.23] - - [1856, 256, 1, 3328] - - [240, 7646.37] + - [374, 7646.37] - - [1856, 2944, 1, 256] - - [247, 6444.98] + - [381, 6444.98] - - [5056, 1024, 1, 128] - - [231, 4607.3] + - [365, 4607.3] - - [64, 5888, 1, 1280] - - [246, 5842.46] + - [380, 5842.46] - - [1760, 7133, 1, 1760] - - [239, 9097.84] + - [373, 9097.84] - - [6784, 256, 1, 128] - - [232, 3685.41] + - [366, 3685.41] - - [5888, 704, 1, 128] - - [231, 3656.23] + - [365, 3656.23] - - [6784, 64, 1, 128] - - [182, 2191.52] + - [316, 2191.52] - - [1024, 4288, 1, 1280] - - [246, 9199.32] + - [380, 9199.32] - - [2368, 5056, 1, 3328] - - [242, 9072.88] + - [376, 9072.88] - - [448, 4, 1, 128] - - [253, 5.42937] + - [387, 5.42937] - - [4, 256, 1, 3328] - - [263, 311.037] + - [397, 311.037] - - [4288, 1024, 1, 3328] - - [244, 8660.33] + - [378, 8660.33] - - [1024, 5056, 1, 3328] - - [240, 8886.76] + - [374, 8886.76] - - [1024, 1856, 1, 3328] - - [245, 8426.24] + - [379, 8426.24] - - [704, 704, 1, 1280] - - [240, 7661.8] + - [374, 7661.8] - - [128, 2368, 1, 1280] - - [214, 5746.15] + - [348, 5746.15] - - [1408, 128, 1, 3328] - - [222, 6530.87] + - [356, 6530.87] - - [3584, 256, 1, 1280] - - [246, 7634.04] + - [380, 7634.04] - - [4, 128, 1, 128] - - [253, 2.07874] + - [387, 2.07874] - - [704, 6784, 1, 128] - - [235, 4589.59] + - [369, 4589.59] - - [3584, 128, 1, 1280] - - [240, 7078.24] + - [374, 7078.24] - - [4, 256, 1, 1280] - - [209, 178.187] + - [343, 178.187] - - [128, 704, 1, 3328] - - [214, 5959.81] + - [348, 5959.81] - - [4288, 6784, 1, 256] - - [240, 9326.54] + - [374, 9326.54] - - [3584, 2944, 1, 3328] - - [242, 9114.16] + - [376, 9114.16] - - [128, 1856, 1, 256] - - [246, 3672.65] + - [380, 3672.65] - - [64, 4288, 1, 256] - - [240, 3457.51] + - [374, 3457.51] - - [4, 3584, 1, 3328] - - [189, 694.37] + - [323, 694.37] - - [64, 4, 1, 3328] - - [209, 71.5738] + - [343, 71.5738] - - [4, 64, 1, 3328] - - [209, 91.9069] + - [343, 91.9069] - - [5888, 2944, 1, 256] - - [239, 7241.55] + - [373, 7241.55] - - [2368, 6784, 1, 128] - - [235, 5229.63] + - [369, 5229.63] - - [448, 4288, 1, 1280] - - [240, 8416.4] + - [374, 8416.4] - - [448, 1856, 1, 3328] - - [240, 7161.56] + - [374, 7161.56] - - [4, 1024, 1, 256] - - [206, 187.346] + - [340, 187.346] - - [5056, 4288, 1, 256] - - [251, 8947.26] + - [385, 8947.26] - - [1024, 448, 1, 256] - - [246, 5318.96] + - [380, 5318.96] - - [1024, 3584, 1, 256] - - [241, 6152.04] + - [375, 6152.04] - - [2944, 128, 1, 1280] - - [222, 6053.63] + - [356, 6053.63] - - [1856, 5056, 1, 128] - - [232, 5091.42] + - [366, 5091.42] - - [64, 256, 1, 256] - - [195, 771.112] + - [329, 771.112] - - [1408, 4, 1, 128] - - [252, 40.8758] + - [386, 40.8758] - - [128, 2368, 1, 128] - - [180, 1520.37] + - [314, 1520.37] - - [256, 704, 1, 1280] - - [240, 4329.81] + - [374, 4329.81] - - [64, 2368, 1, 128] - - [171, 1212.52] + - [305, 1212.52] - - [6784, 6784, 1, 3328] - - [251, 8310.67] + - [385, 8310.67] - - [448, 5888, 1, 1280] - - [246, 8502.33] + - [380, 8502.33] - - [5056, 448, 1, 128] - - [232, 4161.0] + - [366, 4161.0] - - [3584, 2944, 1, 128] - - [232, 4363.51] + - [366, 4363.51] - - [6784, 256, 1, 1280] - - [246, 8629.67] + - [380, 8629.67] - - [256, 2944, 1, 1280] - - [246, 7277.48] + - [380, 7277.48] - - [64, 4288, 1, 128] - - [171, 1822.06] + - [305, 1822.06] - - [2368, 5888, 1, 3328] - - [240, 9017.52] + - [374, 9017.52] - - [4, 64, 1, 256] - - [206, 16.1627] + - [340, 16.1627] - - [704, 1024, 1, 3328] - - [246, 8059.55] + - [380, 8059.55] - - [2368, 1856, 1, 1280] - - [246, 8813.24] + - [380, 8813.24] - - [128, 448, 1, 128] - - [167, 588.244] + - [301, 588.244] - - [128, 6784, 1, 256] - - [246, 6538.28] + - [380, 6538.28] - - [3584, 4288, 1, 128] - - [232, 5025.46] + - [366, 5025.46] - - [64, 448, 1, 128] - - [184, 231.793] + - [318, 231.793] - - [5888, 4288, 1, 3328] - - [240, 9515.88] + - [374, 9515.88] - - [2368, 704, 1, 256] - - [246, 7642.84] + - [380, 7642.84] - - [256, 1856, 1, 3328] - - [246, 6547.17] + - [380, 6547.17] - - [1856, 128, 1, 256] - - [240, 3782.28] + - [374, 3782.28] - - [6784, 128, 1, 128] - - [174, 2835.54] + - [308, 2835.54] - - [3584, 1408, 1, 128] - - [231, 3049.21] + - [365, 3049.21] - - [1856, 5056, 1, 1280] - - [247, 8863.3] + - [381, 8863.3] - - [2944, 1024, 1, 1280] - - [251, 8873.25] + - [385, 8873.25] - - [5056, 4, 1, 256] - - [187, 494.121] + - [321, 494.121] - - [3584, 5888, 1, 3328] - - [239, 9585.25] + - [373, 9585.25] - - [2368, 4288, 1, 256] - - [251, 6419.05] + - [385, 6419.05] - - [1024, 2368, 1, 3328] - - [246, 8645.36] + - [380, 8645.36] - - [64, 704, 1, 3328] - - [228, 4399.93] + - [362, 4399.93] - - [704, 1408, 1, 256] - - [240, 7428.54] + - [374, 7428.54] - - [6784, 1856, 1, 3328] - - [251, 9163.66] + - [385, 9163.66] - - [1024, 2944, 1, 128] - - [235, 3551.98] + - [369, 3551.98] - - [1024, 3584, 1, 1280] - - [249, 9112.47] + - [383, 9112.47] - - [4288, 5888, 1, 3328] - - [239, 8524.05] + - [373, 8524.05] - - [4288, 4, 1, 3328] - - [206, 620.016] + - [340, 620.016] - - [256, 1408, 1, 256] - - [240, 4505.7] + - [374, 4505.7] - - [448, 2944, 1, 1280] - - [240, 7612.87] + - [374, 7612.87] - - [4, 5888, 1, 128] - - [252, 174.564] + - [386, 174.564] - - [1024, 2944, 1, 3328] - - [245, 9136.74] + - [379, 9136.74] - - [3584, 6784, 1, 256] - - [245, 7253.89] + - [379, 7253.89] - - [256, 6784, 1, 1280] - - [240, 8637.72] + - [374, 8637.72] - - [1856, 3584, 1, 256] - - [246, 8199.67] + - [380, 8199.67] - - [128, 448, 1, 3328] - - [227, 4799.92] + - [361, 4799.92] - - [6784, 1856, 1, 128] - - [232, 5185.62] + - [366, 5185.62] - - [4, 448, 1, 256] - - [206, 86.9848] + - [340, 86.9848] - - [2944, 704, 1, 128] - - [235, 3798.64] + - [369, 3798.64] - - [256, 5888, 1, 1280] - - [240, 8678.47] + - [374, 8678.47] - - [4, 128, 1, 1280] - - [209, 102.5] + - [343, 102.5] - - [4288, 6784, 1, 3328] - - [245, 8209.4] + - [379, 8209.4] - - [6784, 128, 1, 1280] - - [222, 6562.99] + - [356, 6562.99] - - [64, 1408, 1, 256] - - [212, 2059.8] + - [346, 2059.8] - - [7680, 5481, 1, 2560] - - [251, 9426.79] + - [385, 9426.79] - - [2368, 1408, 1, 128] - - [232, 4532.5] + - [366, 4532.5] - - [1856, 448, 1, 256] - - [240, 6275.48] + - [374, 6275.48] - - [1408, 1024, 1, 128] - - [232, 3604.58] + - [366, 3604.58] - - [128, 64, 1, 128] - - [167, 87.4813] + - [301, 87.4813] - - [6784, 3584, 1, 3328] - - [247, 8991.92] + - [381, 8991.92] - - [2944, 64, 1, 3328] - - [216, 6043.36] + - [350, 6043.36] - - [64, 64, 1, 128] - - [172, 36.309] + - [306, 36.309] - - [2368, 5056, 1, 1280] - - [246, 9438.48] + - [380, 9438.48] - - [64, 4, 1, 1280] - - [209, 40.2569] + - [343, 40.2569] - - [1408, 2368, 1, 1280] - - [242, 7738.16] + - [376, 7738.16] - - [128, 1408, 1, 1280] - - [214, 4937.74] + - [348, 4937.74] - - [256, 64, 1, 3328] - - [224, 2683.46] + - [358, 2683.46] - - [2944, 4288, 1, 128] - - [232, 5173.81] + - [366, 5173.81] - - [2944, 2944, 1, 256] - - [240, 8943.92] + - [374, 8943.92] - - [2944, 4, 1, 1280] - - [189, 617.857] + - [323, 617.857] - - [5888, 4, 1, 256] - - [255, 483.218] + - [389, 483.218] - - [6784, 256, 1, 256] - - [246, 7916.7] + - [380, 7916.7] - - [256, 5056, 1, 3328] - - [240, 8953.25] + - [374, 8953.25] - - [128, 4288, 1, 1280] - - [193, 6015.05] + - [327, 6015.05] - - [5056, 1856, 1, 128] - - [234, 4221.15] + - [368, 4221.15] - - [5888, 1408, 1, 256] - - [245, 9144.85] + - [379, 9144.85] - - [128, 128, 1, 256] - - [195, 759.938] + - [329, 759.938] - - [5056, 4, 1, 3328] - - [255, 642.818] + - [389, 642.818] - - [4288, 3584, 1, 3328] - - [241, 9300.05] + - [375, 9300.05] - - [448, 704, 1, 3328] - - [247, 4481.08] + - [381, 4481.08] - - [448, 448, 1, 128] - - [171, 1360.81] + - [305, 1360.81] - - [1024, 2368, 1, 1280] - - [240, 8570.29] + - [374, 8570.29] - - [1856, 704, 1, 3328] - - [240, 8448.26] + - [374, 8448.26] - - [4, 2368, 1, 128] - - [252, 64.5902] + - [386, 64.5902] - - [5888, 6784, 1, 3328] - - [247, 9447.12] + - [381, 9447.12] - - [704, 4288, 1, 1280] - - [249, 7476.87] + - [383, 7476.87] - - [704, 256, 1, 256] - - [240, 2957.62] + - [374, 2957.62] - - [6784, 448, 1, 3328] - - [243, 8886.22] + - [377, 8886.22] - - [4288, 1024, 1, 128] - - [231, 3864.49] + - [365, 3864.49] - - [49, 512, 128, 2048] - - [274, 7112.78] + - [408, 7112.78] - - [196, 256, 256, 1024] - - [268, 8302.7] + - [402, 8302.7] - - [784, 512, 256, 128] - - [266, 9061.36] + - [400, 9061.36] - - [49, 2048, 128, 512] - - [264, 6963.36] + - [398, 6963.36] - - [784, 512, 64, 128] - - [266, 8822.62] + - [400, 8822.62] - - [784, 128, 128, 512] - - [273, 8983.63] + - [407, 8983.63] - - [196, 256, 64, 1024] - - [272, 7823.5] + - [406, 7823.5] - - [3136, 256, 256, 64] - - [269, 9051.38] + - [403, 9051.38] - - [3136, 64, 128, 64] - - [265, 8581.35] + - [399, 8581.35] - - [49, 2048, 256, 512] - - [264, 7049.64] + - [398, 7049.64] - - [196, 1024, 64, 256] - - [267, 7953.69] + - [401, 7953.69] - - [784, 128, 256, 512] - - [275, 9102.99] + - [409, 9102.99] - - [196, 256, 128, 1024] - - [267, 8085.89] + - [401, 8085.89] - - [3136, 64, 64, 256] - - [271, 9266.13] + - [405, 9266.13] - - [784, 128, 64, 512] - - [272, 8809.39] + - [406, 8809.39] - - [49, 2048, 64, 512] - - [264, 6843.95] + - [398, 6843.95] - - [3136, 64, 128, 256] - - [271, 9381.39] + - [405, 9381.39] - - [3136, 256, 128, 64] - - [269, 8982.64] + - [403, 8982.64] - - [784, 512, 128, 128] - - [266, 8965.99] + - [400, 8965.99] - - [3136, 256, 64, 64] - - [269, 8879.8] + - [403, 8879.8] - - [3136, 64, 256, 256] - - [271, 9566.43] + - [405, 9566.43] - - [3136, 64, 64, 64] - - [270, 8314.05] + - [404, 8314.05] - - [3136, 64, 256, 64] - - [265, 8743.8] + - [399, 8743.8] - - [196, 1024, 128, 256] - - [268, 8119.43] + - [402, 8119.43] - - [49, 512, 64, 2048] - - [276, 7055.41] + - [410, 7055.41] - - [49, 512, 256, 2048] - - [277, 7166.41] + - [411, 7166.41] - - [196, 1024, 256, 256] - - [268, 8210.66] + - [402, 8210.66] - - [5329, 160, 64, 64] - - [284, 8156.89] + - [418, 8156.89] - - [1225, 288, 64, 48] - - [288, 6926.23] + - [422, 6926.23] - - [1225, 192, 64, 64] - - [290, 7840.1] + - [424, 7840.1] - - [64, 1280, 64, 384] - - [291, 9276.11] + - [425, 9276.11] - - [1225, 384, 64, 192] - - [281, 9162.35] + - [415, 9162.35] - - [1225, 288, 64, 64] - - [282, 7495.27] + - [416, 7495.27] - - [5329, 64, 64, 80] - - [283, 8480.13] + - [417, 8480.13] - - [289, 1024, 64, 256] - - [281, 8483.83] + - [415, 8483.83] - - [289, 768, 64, 192] - - [287, 8234.84] + - [421, 8234.84] - - [289, 768, 64, 128] - - [287, 7988.81] + - [421, 7988.81] - - [64, 1536, 64, 384] - - [291, 9323.65] + - [425, 9323.65] - - [1225, 384, 64, 64] - - [290, 8158.8] + - [424, 8158.8] - - [64, 2048, 64, 192] - - [287, 8818.61] + - [421, 8818.61] - - [64, 1280, 64, 320] - - [283, 9202.17] + - [417, 9202.17] - - [1225, 384, 64, 96] - - [281, 8540.7] + - [415, 8540.7] - - [64, 1280, 64, 448] - - [287, 9317.82] + - [421, 9317.82] - - [289, 768, 64, 160] - - [291, 8128.81] + - [425, 8128.81] - - [1225, 192, 64, 32] - - [290, 6495.37] + - [424, 6495.37] - - [64, 1536, 64, 256] - - [287, 9143.0] + - [421, 9143.0] - - [1225, 256, 64, 48] - - [285, 7545.36] + - [419, 7545.36] - - [1225, 256, 64, 64] - - [286, 7972.45] + - [420, 7972.45] - - [1225, 192, 64, 48] - - [289, 7348.9] + - [423, 7348.9] - - [289, 1024, 64, 384] - - [279, 8725.66] + - [413, 8725.66] - - [289, 1024, 64, 192] - - [281, 8313.16] + - [415, 8313.16] - - [64, 1280, 64, 192] - - [283, 8768.68] + - [417, 8768.68] - - [64, 2048, 64, 320] - - [280, 9147.98] + - [414, 9147.98] - - [64, 2048, 64, 448] - - [278, 9304.16] + - [412, 9304.16] - - [64, 2048, 64, 384] - - [280, 9235.28] + - [414, 9235.28] - - [289, 1024, 64, 128] - - [287, 7989.51] + - [421, 7989.51] - - [4096, 1024, 1, 2984] - - [326, 9846.39] + - [460, 9846.39] - - [1024, 4096, 1, 3437] - - [327, 9915.8] + - [461, 9915.8] - - [1024, 4096, 1, 3235] - - [320, 9914.02] + - [454, 9914.02] - - [4096, 1024, 1, 4032] - - [326, 9926.06] + - [460, 9926.06] - - [1024, 4096, 1, 3334] - - [327, 9918.27] + - [461, 9918.27] - - [4096, 1024, 1, 3288] - - [327, 9854.67] + - [461, 9854.67] - - [1024, 4096, 1, 3515] - - [327, 9924.03] + - [461, 9924.03] - - [4096, 1024, 1, 3437] - - [327, 9869.63] + - [461, 9869.63] - - [1024, 4096, 1, 3259] - - [327, 9907.65] + - [461, 9907.65] - - [1024, 4096, 1, 3384] - - [319, 9921.21] + - [453, 9921.21] - - [64, 92, 688, 92] - - [297, 6137.89] + - [431, 6137.89] - - [4096, 1024, 1, 3458] - - [326, 9887.69] + - [460, 9887.69] - - [1024, 4096, 1, 3412] - - [326, 9930.56] + - [460, 9930.56] - - [1024, 4096, 1, 3529] - - [320, 9924.54] + - [454, 9924.54] - - [1024, 4096, 1, 4032] - - [327, 9963.48] + - [461, 9963.48] - - [4096, 1024, 1, 3999] - - [327, 9895.0] + - [461, 9895.0] - - [1024, 4096, 1, 3079] - - [320, 9894.58] + - [454, 9894.58] - - [1024, 4096, 1, 3876] - - [319, 9949.39] + - [453, 9949.39] - - [1024, 4096, 1, 3450] - - [327, 9915.65] + - [461, 9915.65] - - [1024, 4096, 1, 3256] - - [327, 9911.18] + - [461, 9911.18] - - [4096, 1024, 1, 3403] - - [326, 9858.93] + - [460, 9858.93] - - [1024, 1024, 1, 3975] - - [317, 8990.81] + - [451, 8990.81] - - [1024, 4096, 1, 3359] - - [327, 9915.0] + - [461, 9915.0] - - [4096, 1024, 1, 3549] - - [326, 9870.66] + - [460, 9870.66] - - [4096, 1024, 1, 3176] - - [327, 9855.92] + - [461, 9855.92] - - [1024, 4096, 1, 3504] - - [319, 9934.17] + - [453, 9934.17] - - [4096, 1024, 1, 3314] - - [326, 9873.9] + - [460, 9873.9] - - [4096, 1024, 1, 3183] - - [326, 9843.84] + - [460, 9843.84] - - [1024, 4096, 1, 3209] - - [320, 9904.97] + - [454, 9904.97] - - [1024, 4096, 1, 3720] - - [319, 9934.16] + - [453, 9934.16] - - [1024, 4096, 1, 3859] - - [319, 9952.53] + - [453, 9952.53] - - [1024, 33708, 1, 4059] - - [319, 10321.5] + - [453, 10321.5] - - [1024, 4096, 1, 3968] - - [319, 9955.96] + - [453, 9955.96] - - [64, 123, 528, 123] - - [292, 6916.21] + - [426, 6916.21] - - [4096, 1024, 1, 3477] - - [327, 9872.03] + - [461, 9872.03] - - [4096, 1024, 1, 3233] - - [327, 9862.35] + - [461, 9862.35] - - [4096, 1024, 1, 3409] - - [327, 9876.86] + - [461, 9876.86] - - [4096, 1024, 1, 3564] - - [327, 9870.49] + - [461, 9870.49] - - [64, 102, 624, 100] - - [292, 5773.16] + - [426, 5773.16] - - [4096, 1024, 1, 3190] - - [326, 9850.97] + - [460, 9850.97] - - [64, 112, 576, 111] - - [292, 6517.35] + - [426, 6517.35] - - [1024, 4096, 1, 3288] - - [326, 9911.9] + - [460, 9911.9] - - [4096, 1024, 1, 3451] - - [326, 9859.61] + - [460, 9859.61] - - [1024, 4096, 1, 3348] - - [319, 9915.47] + - [453, 9915.47] - - [64, 102, 624, 102] - - [292, 5783.7] + - [426, 5783.7] - - [1024, 4096, 1, 3465] - - [320, 9913.12] + - [454, 9913.12] - - [1024, 33708, 1, 4032] - - [319, 10340.4] + - [453, 10340.4] - - [1024, 33708, 1, 3840] - - [319, 10341.8] + - [453, 10341.8] - - [4096, 1024, 1, 3391] - - [327, 9861.77] + - [461, 9861.77] - - [1024, 4096, 1, 3530] - - [319, 9920.44] + - [453, 9920.44] - - [4096, 1024, 1, 3209] - - [326, 9847.0] + - [460, 9847.0] - - [1024, 4096, 1, 3457] - - [320, 9917.29] + - [454, 9917.29] - - [1024, 4096, 1, 3386] - - [319, 9917.65] + - [453, 9917.65] - - [4096, 1024, 1, 3350] - - [326, 9884.54] + - [460, 9884.54] - - [1024, 4096, 1, 3184] - - [327, 9925.98] + - [461, 9925.98] - - [1024, 4096, 1, 3093] - - [326, 9902.55] + - [460, 9902.55] - - [64, 133, 480, 135] - - [309, 6205.97] + - [443, 6205.97] - - [1024, 4096, 1, 3400] - - [319, 9917.1] + - [453, 9917.1] - - [1024, 1024, 1, 4026] - - [325, 9014.39] + - [459, 9014.39] - - [1024, 4096, 1, 3214] - - [319, 9895.94] + - [453, 9895.94] - - [4096, 1024, 1, 3406] - - [327, 9857.82] + - [461, 9857.82] - - [1024, 4096, 1, 3565] - - [326, 9919.37] + - [460, 9919.37] - - [4096, 1024, 1, 3536] - - [327, 9889.06] + - [461, 9889.06] - - [1024, 4096, 1, 3183] - - [326, 9907.55] + - [460, 9907.55] - - [1024, 4096, 1, 3462] - - [327, 9922.4] + - [461, 9922.4] - - [4096, 1024, 1, 3130] - - [320, 9846.04] + - [454, 9846.04] - - [4096, 1024, 1, 3381] - - [327, 9868.27] + - [461, 9868.27] - - [4096, 1024, 1, 3298] - - [326, 9870.54] + - [460, 9870.54] - - [1024, 4096, 1, 3292] - - [319, 9906.3] + - [453, 9906.3] - - [4096, 1024, 1, 3289] - - [326, 9856.55] + - [460, 9856.55] - - [64, 160, 400, 159] - - [312, 7427.84] + - [446, 7427.84] - - [1024, 4096, 1, 3379] - - [319, 9917.09] + - [453, 9917.09] - - [1024, 4096, 1, 3990] - - [320, 9947.37] + - [454, 9947.37] - - [1024, 4096, 1, 3540] - - [327, 9935.76] + - [461, 9935.76] - - [4096, 1024, 1, 3412] - - [327, 9867.56] + - [461, 9867.56] - - [1024, 1024, 1, 3780] - - [322, 9036.26] + - [456, 9036.26] - - [1024, 4096, 1, 3555] - - [326, 9927.37] + - [460, 9927.37] - - [1024, 4096, 1, 3518] - - [320, 9925.55] + - [454, 9925.55] - - [4096, 1024, 1, 3189] - - [326, 9861.24] + - [460, 9861.24] - - [1024, 4096, 1, 3298] - - [320, 9923.22] + - [454, 9923.22] - - [4096, 1024, 1, 3072] - - [326, 9872.08] + - [460, 9872.08] - - [1024, 4096, 1, 3393] - - [327, 9929.28] + - [461, 9929.28] - - [1024, 4096, 1, 3207] - - [319, 9912.81] + - [453, 9912.81] - - [64, 228, 272, 232] - - [315, 7350.14] + - [449, 7350.14] - - [64, 23, 2720, 23] - - [296, 2640.25] + - [430, 2640.25] - - [4096, 1024, 1, 3487] - - [327, 9860.91] + - [461, 9860.91] - - [1024, 1024, 1, 3822] - - [325, 8993.96] + - [459, 8993.96] - - [64, 77, 816, 77] - - [297, 5273.19] + - [431, 5273.19] - - [4096, 1024, 1, 3431] - - [327, 9867.53] + - [461, 9867.53] - - [4096, 1024, 1, 3378] - - [326, 9888.14] + - [460, 9888.14] - - [4096, 1024, 1, 3529] - - [320, 9879.5] + - [454, 9879.5] - - [4096, 1024, 1, 3460] - - [327, 9877.25] + - [461, 9877.25] - - [1024, 4096, 1, 3336] - - [319, 9912.41] + - [453, 9912.41] - - [1024, 4096, 1, 3501] - - [320, 9914.4] + - [454, 9914.4] - - [64, 159, 400, 159] - - [310, 7016.51] + - [444, 7016.51] - - [1024, 4096, 1, 3584] - - [327, 9940.59] + - [461, 9940.59] - - [64, 135, 480, 134] - - [310, 6241.39] + - [444, 6241.39] - - [64, 99, 624, 99] - - [301, 5617.39] + - [435, 5617.39] - - [4096, 1024, 1, 2499] - - [326, 9813.57] + - [460, 9813.57] - - [1024, 1024, 1, 3942] - - [322, 9060.01] + - [456, 9060.01] - - [4096, 1024, 1, 3352] - - [326, 9867.12] + - [460, 9867.12] - - [1024, 4096, 1, 3543] - - [327, 9928.77] + - [461, 9928.77] - - [1024, 4096, 1, 3476] - - [326, 9931.58] + - [460, 9931.58] - - [1024, 33708, 1, 3822] - - [319, 10324.7] + - [453, 10324.7] - - [1024, 4096, 1, 3436] - - [319, 9917.28] + - [453, 9917.28] - - [1024, 1024, 1, 3861] - - [318, 8998.49] + - [452, 8998.49] - - [1024, 1024, 1, 4000] - - [323, 9058.3] + - [457, 9058.3] - - [1024, 4096, 1, 3594] - - [319, 9927.88] + - [453, 9927.88] - - [4096, 1024, 1, 3514] - - [327, 9872.3] + - [461, 9872.3] - - [1024, 4096, 1, 3064] - - [326, 9907.1] + - [460, 9907.1] - - [4096, 1024, 1, 3371] - - [319, 9857.74] + - [453, 9857.74] - - [4096, 1024, 1, 3558] - - [327, 9876.31] + - [461, 9876.31] - - [4096, 1024, 1, 3517] - - [326, 9866.45] + - [460, 9866.45] - - [4096, 1024, 1, 3144] - - [326, 9846.36] + - [460, 9846.36] - - [1024, 4096, 1, 3312] - - [319, 9932.85] + - [453, 9932.85] - - [4096, 1024, 1, 3079] - - [326, 9851.1] + - [460, 9851.1] - - [1024, 4096, 1, 3415] - - [319, 9919.47] + - [453, 9919.47] - - [1024, 4096, 1, 3221] - - [326, 9908.18] + - [460, 9908.18] - - [1024, 4096, 1, 3978] - - [320, 9944.41] + - [454, 9944.41] - - [4096, 1024, 1, 3876] - - [326, 9898.99] + - [460, 9898.99] - - [1024, 4096, 1, 3528] - - [319, 9919.6] + - [453, 9919.6] - - [1024, 4096, 1, 3181] - - [327, 9894.86] + - [461, 9894.86] - - [4096, 1024, 1, 3445] - - [326, 9878.54] + - [460, 9878.54] - - [4096, 1024, 1, 3450] - - [319, 9864.82] + - [453, 9864.82] - - [4096, 1024, 1, 3377] - - [326, 9879.69] + - [460, 9879.69] - - [1024, 4096, 1, 3532] - - [320, 9928.19] + - [454, 9928.19] - - [1024, 33708, 1, 3944] - - [319, 10329.7] + - [453, 10329.7] - - [4096, 1024, 1, 3483] - - [326, 9861.83] + - [460, 9861.83] - - [1024, 4096, 1, 3358] - - [319, 9903.69] + - [453, 9903.69] - - [4096, 1024, 1, 3464] - - [326, 9876.84] + - [460, 9876.84] - - [4096, 1024, 1, 3282] - - [319, 9859.23] + - [453, 9859.23] - - [4096, 1024, 1, 3256] - - [327, 9855.1] + - [461, 9855.1] - - [1024, 4096, 1, 3057] - - [326, 9910.75] + - [460, 9910.75] - - [4096, 1024, 1, 3481] - - [326, 9866.29] + - [460, 9866.29] - - [4096, 1024, 1, 3340] - - [326, 9862.25] + - [460, 9862.25] - - [1024, 1024, 1, 3870] - - [325, 9082.45] + - [459, 9082.45] - - [1024, 4096, 1, 3273] - - [319, 9916.29] + - [453, 9916.29] - - [64, 65, 992, 65] - - [310, 4683.01] + - [444, 4683.01] - - [4096, 1024, 1, 3392] - - [320, 9881.12] + - [454, 9881.12] - - [4096, 1024, 1, 3337] - - [326, 9864.5] + - [460, 9864.5] - - [4096, 1024, 1, 3359] - - [326, 9874.42] + - [460, 9874.42] - - [4096, 1024, 1, 3498] - - [327, 9864.35] + - [461, 9864.35] - - [4096, 1024, 1, 3169] - - [326, 9851.1] + - [460, 9851.1] - - [1024, 33708, 1, 3859] - - [320, 10332.6] + - [454, 10332.6] - - [64, 19, 3264, 19] - - [296, 2182.14] + - [430, 2182.14] - - [1024, 4096, 1, 3103] - - [319, 9898.9] + - [453, 9898.9] - - [4096, 1024, 1, 3900] - - [326, 9897.12] + - [460, 9897.12] - - [1024, 4096, 1, 3442] - - [326, 9938.97] + - [460, 9938.97] - - [1024, 4096, 1, 3248] - - [326, 9939.92] + - [460, 9939.92] - - [1024, 4096, 1, 3351] - - [327, 9923.23] + - [461, 9923.23] - - [4096, 1024, 1, 3593] - - [326, 9894.36] + - [460, 9894.36] - - [1024, 4096, 1, 3780] - - [326, 9941.96] + - [460, 9941.96] - - [64, 133, 480, 133] - - [310, 6180.79] + - [444, 6180.79] - - [1024, 33708, 1, 3681] - - [319, 10332.3] + - [453, 10332.3] - - [4096, 1024, 1, 3374] - - [320, 9859.36] + - [454, 9859.36] - - [1024, 4096, 1, 3557] - - [319, 9928.2] + - [453, 9928.2] - - [4096, 1024, 1, 3906] - - [326, 9907.07] + - [460, 9907.07] - - [4096, 1024, 1, 3504] - - [326, 9886.05] + - [460, 9886.05] - - [1024, 4096, 1, 3270] - - [326, 9916.37] + - [460, 9916.37] - - [4096, 1024, 1, 3098] - - [319, 9854.76] + - [453, 9854.76] - - [64, 232, 272, 232] - - [315, 7394.1] + - [449, 7394.1] - - [4096, 1024, 1, 3216] - - [327, 9876.57] + - [461, 9876.57] - - [64, 148, 432, 148] - - [312, 6663.85] + - [446, 6663.85] - - [1024, 4096, 1, 3550] - - [326, 9920.28] + - [460, 9920.28] - - [4096, 1024, 1, 3449] - - [320, 9870.57] + - [454, 9870.57] - - [1024, 4096, 1, 3403] - - [327, 9908.21] + - [461, 9908.21] - - [1024, 4096, 1, 3523] - - [326, 9932.71] + - [460, 9932.71] - - [1024, 4096, 1, 3486] - - [326, 9917.46] + - [460, 9917.46] - - [1024, 4096, 1, 3564] - - [326, 9923.44] + - [460, 9923.44] - - [1024, 33708, 1, 4005] - - [319, 10339.5] + - [453, 10339.5] - - [4096, 1024, 1, 3296] - - [326, 9879.78] + - [460, 9879.78] - - [1024, 4096, 1, 3263] - - [319, 9907.17] + - [453, 9907.17] - - [64, 25, 2512, 25] - - [296, 2848.17] + - [430, 2848.17] - - [1024, 4096, 1, 3130] - - [327, 9900.1] + - [461, 9900.1] - - [1024, 4096, 1, 3295] - - [327, 9895.45] + - [461, 9895.45] - - [1024, 33708, 1, 3925] - - [320, 10342.3] + - [454, 10342.3] - - [1024, 4096, 1, 3378] - - [319, 9921.37] + - [453, 9921.37] - - [4096, 1024, 1, 3720] - - [327, 9885.82] + - [461, 9885.82] - - [4096, 1024, 1, 3399] - - [326, 9880.65] + - [460, 9880.65] - - [4096, 1024, 1, 3543] - - [327, 9870.73] + - [461, 9870.73] - - [64, 9, 6544, 9] - - [299, 955.17] + - [433, 955.17] - - [4096, 1024, 1, 3497] - - [326, 9868.43] + - [460, 9868.43] - - [4096, 1024, 1, 3594] - - [327, 9876.88] + - [461, 9876.88] - - [1024, 4096, 1, 3144] - - [327, 9901.96] + - [461, 9901.96] - - [1024, 4096, 1, 3975] - - [320, 9950.19] + - [454, 9950.19] - - [4096, 1024, 1, 3205] - - [327, 9856.07] + - [461, 9856.07] - - [1024, 33708, 1, 3995] - - [319, 10331.1] + - [453, 10331.1] - - [1024, 4096, 1, 3392] - - [319, 9935.78] + - [453, 9935.78] - - [1024, 4096, 1, 3055] - - [327, 9893.25] + - [461, 9893.25] - - [1024, 4096, 1, 4026] - - [327, 9940.22] + - [461, 9940.22] - - [4096, 1024, 1, 3557] - - [326, 9884.0] + - [460, 9884.0] - - [4096, 1024, 1, 3515] - - [326, 9871.94] + - [460, 9871.94] - - [4096, 1024, 1, 3486] - - [327, 9860.74] + - [461, 9860.74] - - [4096, 1024, 1, 3457] - - [327, 9885.37] + - [461, 9885.37] - - [1024, 4096, 1, 3511] - - [319, 9928.24] + - [453, 9928.24] - - [4096, 1024, 1, 3138] - - [326, 9854.06] + - [460, 9854.06] - - [1024, 4096, 1, 3339] - - [320, 9912.89] + - [454, 9912.89] - - [1024, 4096, 1, 3939] - - [320, 9952.26] + - [454, 9952.26] - - [4096, 1024, 1, 3500] - - [320, 9863.62] + - [454, 9863.62] - - [4096, 1024, 1, 3395] - - [327, 9883.82] + - [461, 9883.82] - - [4096, 1024, 1, 3968] - - [327, 9920.36] + - [461, 9920.36] - - [4096, 1024, 1, 4020] - - [327, 9912.81] + - [461, 9912.81] - - [4096, 1024, 1, 3942] - - [326, 9910.17] + - [460, 9910.17] - - [1024, 1024, 1, 4032] - - [316, 9024.74] + - [450, 9024.74] - - [4096, 1024, 1, 3349] - - [327, 9866.04] + - [461, 9866.04] - - [1024, 4096, 1, 3322] - - [320, 9908.43] + - [454, 9908.43] - - [4096, 1024, 1, 3452] - - [326, 9872.69] + - [460, 9872.69] - - [1024, 4096, 1, 3417] - - [326, 9912.64] + - [460, 9912.64] - - [1024, 1024, 1, 4012] - - [324, 9085.47] + - [458, 9085.47] - - [1024, 4096, 1, 3526] - - [320, 9920.36] + - [454, 9920.36] - - [4096, 1024, 1, 3485] - - [320, 9861.64] + - [454, 9861.64] - - [1024, 1024, 1, 3681] - - [324, 8991.46] + - [458, 8991.46] - - [4096, 1024, 1, 3303] - - [327, 9861.3] + - [461, 9861.3] - - [4096, 1024, 1, 3344] - - [327, 9892.44] + - [461, 9892.44] - - [1024, 4096, 1, 3479] - - [327, 9921.77] + - [461, 9921.77] - - [4096, 1024, 1, 3300] - - [326, 9868.64] + - [460, 9868.64] - - [1024, 4096, 1, 3439] - - [320, 9918.29] + - [454, 9918.29] - - [4096, 1024, 1, 3280] - - [327, 9875.29] + - [461, 9875.29] - - [1024, 4096, 1, 3245] - - [319, 9910.49] + - [453, 9910.49] - - [1024, 4096, 1, 3328] - - [319, 9941.6] + - [453, 9941.6] - - [4096, 1024, 1, 3418] - - [319, 9870.76] + - [453, 9870.76] - - [1024, 4096, 1, 3493] - - [327, 9938.45] + - [461, 9938.45] - - [1024, 4096, 1, 3500] - - [319, 9916.93] + - [453, 9916.93] - - [1024, 4096, 1, 3166] - - [319, 9898.12] + - [453, 9898.12] - - [4096, 1024, 1, 3126] - - [320, 9847.04] + - [454, 9847.04] - - [1024, 4096, 1, 3277] - - [327, 9898.66] + - [461, 9898.66] - - [1024, 4096, 1, 3315] - - [326, 9923.11] + - [460, 9923.11] - - [1024, 1024, 1, 3927] - - [317, 8987.71] + - [451, 8987.71] - - [1024, 4096, 1, 3414] - - [319, 9916.01] + - [453, 9916.01] - - [4096, 1024, 1, 3531] - - [326, 9871.92] + - [460, 9871.92] - - [4096, 1024, 1, 3484] - - [319, 9867.86] + - [453, 9867.86] - - [1024, 4096, 1, 3180] - - [326, 9904.09] + - [460, 9904.09] - - [4096, 1024, 1, 3360] - - [326, 9879.57] + - [460, 9879.57] - - [1024, 33708, 1, 3990] - - [319, 10335.0] + - [453, 10335.0] - - [4096, 1024, 1, 3466] - - [326, 9875.02] + - [460, 9875.02] - - [1024, 4096, 1, 3428] - - [319, 9916.02] + - [453, 9916.02] - - [1024, 4096, 1, 3137] - - [326, 9913.27] + - [460, 9913.27] - - [4096, 1024, 1, 4059] - - [326, 9901.86] + - [460, 9901.86] - - [1024, 4096, 1, 3353] - - [326, 9914.6] + - [460, 9914.6] - - [1024, 4096, 1, 3942] - - [326, 9944.5] + - [460, 9944.5] - - [4096, 1024, 1, 3506] - - [319, 9875.75] + - [453, 9875.75] - - [1024, 1024, 1, 3894] - - [317, 8946.55] + - [451, 8946.55] - - [4096, 1024, 1, 3508] - - [327, 9877.67] + - [461, 9877.67] - - [64, 132, 480, 135] - - [310, 6164.86] + - [444, 6164.86] - - [4096, 1024, 1, 3956] - - [319, 9907.83] + - [453, 9907.83] - - [64, 7, 8192, 7] - - [298, 813.078] + - [432, 813.078] - - [1024, 4096, 1, 3272] - - [320, 9909.82] + - [454, 9909.82] - - [1024, 4096, 1, 3443] - - [327, 9929.83] + - [461, 9929.83] - - [1024, 4096, 1, 3375] - - [327, 9909.23] + - [461, 9909.23] - - [1024, 4096, 1, 3525] - - [327, 9929.27] + - [461, 9929.27] - - [4096, 1024, 1, 3472] - - [326, 9889.97] + - [460, 9889.97] - - [1024, 4096, 1, 3520] - - [319, 9947.79] + - [453, 9947.79] - - [4096, 1024, 1, 3322] - - [326, 9862.98] + - [460, 9862.98] - - [4096, 1024, 1, 3387] - - [326, 9861.62] + - [460, 9861.62] - - [64, 8, 7280, 8] - - [304, 1024.1] + - [438, 1024.1] - - [1024, 33708, 1, 3939] - - [319, 10339.9] + - [453, 10339.9] - - [4096, 1024, 1, 3345] - - [327, 9873.68] + - [461, 9873.68] - - [4096, 1024, 1, 2967] - - [326, 9839.21] + - [460, 9839.21] - - [1024, 4096, 1, 3453] - - [319, 9905.81] + - [453, 9905.81] - - [1024, 4096, 1, 3640] - - [326, 9934.05] + - [460, 9934.05] - - [4096, 1024, 1, 3291] - - [320, 9860.84] + - [454, 9860.84] - - [1024, 4096, 1, 3350] - - [327, 9918.03] + - [461, 9918.03] - - [4096, 1024, 1, 3417] - - [326, 9864.61] + - [460, 9864.61] - - [64, 135, 480, 135] - - [310, 6265.45] + - [444, 6265.45] - - [1024, 4096, 1, 3467] - - [320, 9906.95] + - [454, 9906.95] - - [1024, 4096, 1, 3491] - - [326, 9933.3] + - [460, 9933.3] - - [1024, 4096, 1, 3822] - - [326, 9938.75] + - [460, 9938.75] - - [4096, 1024, 1, 3292] - - [326, 9849.21] + - [460, 9849.21] - - [1024, 4096, 1, 3231] - - [319, 9905.82] + - [453, 9905.82] - - [1024, 4096, 1, 3364] - - [320, 9930.32] + - [454, 9930.32] - - [1024, 4096, 1, 3995] - - [320, 9943.76] + - [454, 9943.76] - - [1024, 4096, 1, 3545] - - [319, 9928.53] + - [453, 9928.53] - - [1024, 1024, 1, 3876] - - [317, 9003.04] + - [451, 9003.04] - - [1024, 4096, 1, 3186] - - [319, 9921.01] + - [453, 9921.01] - - [4096, 1024, 1, 3432] - - [326, 9875.29] + - [460, 9875.29] - - [64, 84, 752, 85] - - [297, 5704.51] + - [431, 5704.51] - - [4096, 1024, 1, 3367] - - [320, 9868.06] + - [454, 9868.06] - - [4096, 1024, 1, 3503] - - [327, 9871.01] + - [461, 9871.01] - - [1024, 4096, 1, 3095] - - [320, 9902.9] + - [454, 9902.9] - - [4096, 1024, 1, 3465] - - [327, 9872.17] + - [461, 9872.17] - - [1024, 4096, 1, 3402] - - [326, 9914.66] + - [460, 9914.66] - - [4096, 1024, 1, 3140] - - [326, 9847.95] + - [460, 9847.95] - - [1024, 1024, 1, 4050] - - [323, 9055.75] + - [457, 9055.75] - - [4096, 1024, 1, 3424] - - [320, 9894.62] + - [454, 9894.62] - - [4096, 1024, 1, 3257] - - [319, 9860.97] + - [453, 9860.97] - - [4096, 1024, 1, 2917] - - [326, 9845.91] + - [460, 9845.91] - - [1024, 33708, 1, 3640] - - [319, 10321.7] + - [453, 10321.7] - - [1024, 4096, 1, 3456] - - [319, 9950.35] + - [453, 9950.35] - - [1024, 4096, 1, 3014] - - [319, 9907.97] + - [453, 9907.97] - - [4096, 1024, 1, 3372] - - [327, 9868.37] + - [461, 9868.37] - - [64, 132, 480, 132] - - [310, 6121.62] + - [444, 6121.62] - - [1024, 4096, 1, 3294] - - [327, 9903.23] + - [461, 9903.23] - - [4096, 1024, 1, 3446] - - [327, 9871.69] + - [461, 9871.69] - - [1024, 4096, 1, 3389] - - [320, 9909.27] + - [454, 9909.27] - - [4096, 1024, 1, 3259] - - [326, 9860.76] + - [460, 9860.76] - - [4096, 1024, 1, 3544] - - [326, 9878.76] + - [460, 9878.76] - - [4096, 1024, 1, 3479] - - [327, 9873.97] + - [461, 9873.97] - - [4096, 1024, 1, 3542] - - [326, 9878.97] + - [460, 9878.97] - - [4096, 1024, 1, 3321] - - [319, 9861.13] + - [453, 9861.13] - - [1024, 4096, 1, 3147] - - [319, 9894.77] + - [453, 9894.77] - - [1024, 4096, 1, 3944] - - [319, 9950.51] + - [453, 9950.51] - - [4096, 1024, 1, 3870] - - [327, 9881.74] + - [461, 9881.74] - - [1024, 4096, 1, 3308] - - [319, 9907.26] + - [453, 9907.26] - - [4096, 1024, 1, 3401] - - [326, 9864.59] + - [460, 9864.59] - - [1024, 4096, 1, 3395] - - [319, 9929.03] + - [453, 9929.03] - - [64, 99, 624, 102] - - [295, 5651.36] + - [429, 5651.36] - - [1024, 4096, 1, 3563] - - [326, 9922.76] + - [460, 9922.76] - - [1024, 33708, 1, 3870] - - [319, 10325.4] + - [453, 10325.4] - - [4096, 1024, 1, 3494] - - [326, 9875.37] + - [460, 9875.37] - - [1024, 4096, 1, 3271] - - [319, 9913.09] + - [453, 9913.09] - - [1024, 33708, 1, 3910] - - [319, 10341.5] + - [453, 10341.5] - - [1024, 4096, 1, 3287] - - [327, 9924.87] + - [461, 9924.87] - - [1024, 33708, 1, 3860] - - [319, 10330.7] + - [453, 10330.7] - - [64, 143, 432, 148] - - [312, 6571.78] + - [446, 6571.78] - - [1024, 1024, 1, 3584] - - [324, 8975.31] + - [458, 8975.31] - - [64, 162, 400, 162] - - [314, 6822.26] + - [448, 6822.26] - - [4096, 1024, 1, 3341] - - [326, 9854.66] + - [460, 9854.66] - - [1024, 4096, 1, 3136] - - [319, 9926.86] + - [453, 9926.86] - - [4096, 1024, 1, 3439] - - [326, 9854.33] + - [460, 9854.33] - - [64, 148, 432, 147] - - [310, 6677.61] + - [444, 6677.61] - - [1024, 4096, 1, 3751] - - [326, 9938.48] + - [460, 9938.48] - - [1024, 4096, 1, 3301] - - [326, 9919.15] + - [460, 9919.15] - - [4096, 1024, 1, 3468] - - [327, 9859.83] + - [461, 9859.83] - - [1024, 4096, 1, 3416] - - [327, 9918.52] + - [461, 9918.52] - - [4096, 1024, 1, 3163] - - [326, 9854.65] + - [460, 9854.65] - - [1024, 4096, 1, 3230] - - [320, 9897.54] + - [454, 9897.54] - - [1024, 4096, 1, 3581] - - [320, 9915.48] + - [454, 9915.48] - - [1024, 1024, 1, 3960] - - [322, 9045.86] + - [456, 9045.86] - - [4096, 1024, 1, 3463] - - [327, 9884.74] + - [461, 9884.74] - - [1024, 4096, 1, 3478] - - [320, 9927.02] + - [454, 9927.02] - - [4096, 1024, 1, 3262] - - [326, 9852.22] + - [460, 9852.22] - - [1024, 4096, 1, 3438] - - [326, 9912.68] + - [460, 9912.68] - - [1024, 4096, 1, 3244] - - [319, 9900.51] + - [453, 9900.51] - - [1024, 4096, 1, 3445] - - [319, 9920.32] + - [453, 9920.32] - - [4096, 1024, 1, 3328] - - [326, 9888.07] + - [460, 9888.07] - - [1024, 4096, 1, 3492] - - [320, 9937.22] + - [454, 9937.22] - - [4096, 1024, 1, 3211] - - [320, 9847.95] + - [454, 9847.95] - - [1024, 4096, 1, 3910] - - [327, 9946.57] + - [461, 9946.57] - - [1024, 4096, 1, 3314] - - [319, 9932.6] + - [453, 9932.6] - - [4096, 1024, 1, 3859] - - [326, 9902.84] + - [460, 9902.84] - - [4096, 1024, 1, 3383] - - [326, 9875.2] + - [460, 9875.2] - - [1024, 4096, 1, 3409] - - [327, 9926.79] + - [461, 9926.79] - - [1024, 4096, 1, 4020] - - [319, 9941.8] + - [453, 9941.8] - - [4096, 1024, 1, 3530] - - [326, 9872.81] + - [460, 9872.81] - - [4096, 1024, 1, 3411] - - [327, 9875.02] + - [461, 9875.02] - - [1024, 4096, 1, 3566] - - [327, 9921.1] + - [461, 9921.1] - - [4096, 1024, 1, 3493] - - [319, 9875.74] + - [453, 9875.74] - - [4096, 1024, 1, 3184] - - [326, 9873.14] + - [460, 9873.14] - - [1024, 4096, 1, 3072] - - [319, 9923.79] + - [453, 9923.79] - - [1024, 4096, 1, 3431] - - [320, 9911.03] + - [454, 9911.03] - - [4096, 1024, 1, 3306] - - [327, 9853.42] + - [461, 9853.42] - - [1024, 4096, 1, 3352] - - [327, 9913.32] + - [461, 9913.32] - - [4096, 1024, 1, 3295] - - [326, 9862.68] + - [460, 9862.68] - - [64, 123, 528, 122] - - [292, 6950.25] + - [426, 6950.25] - - [1024, 4096, 1, 3517] - - [320, 9920.06] + - [454, 9920.06] - - [64, 102, 624, 101] - - [300, 5791.49] + - [434, 5791.49] - - [4096, 1024, 1, 3426] - - [326, 9891.14] + - [460, 9891.14] - - [4096, 1024, 1, 3385] - - [326, 9868.41] + - [460, 9868.41] - - [1024, 1024, 1, 3978] - - [317, 9008.48] + - [451, 9008.48] - - [4096, 1024, 1, 3572] - - [319, 9884.81] + - [453, 9884.81] - - [4096, 1024, 1, 3459] - - [326, 9892.17] + - [460, 9892.17] - - [1024, 4096, 1, 3374] - - [327, 9908.52] + - [461, 9908.52] - - [4096, 1024, 1, 3166] - - [326, 9832.45] + - [460, 9832.45] - - [4096, 1024, 1, 3093] - - [327, 9841.25] + - [461, 9841.25] - - [4096, 1024, 1, 3523] - - [320, 9879.05] + - [454, 9879.05] - - [4096, 1024, 1, 3413] - - [320, 9880.81] + - [454, 9880.81] - - [1024, 4096, 1, 3996] - - [319, 9948.14] + - [453, 9948.14] - - [1024, 4096, 1, 3452] - - [327, 9915.97] + - [461, 9915.97] - - [4096, 1024, 1, 3232] - - [327, 9876.54] + - [461, 9876.54] - - [4096, 1024, 1, 3400] - - [319, 9867.15] + - [453, 9867.15] - - [4096, 1024, 1, 3334] - - [326, 9868.99] + - [460, 9868.99] - - [1024, 4096, 1, 3345] - - [319, 9920.6] + - [453, 9920.6] - - [1024, 4096, 1, 3538] - - [326, 9933.34] + - [460, 9933.34] - - [1024, 4096, 1, 3466] - - [326, 9920.85] + - [460, 9920.85] - - [4096, 1024, 1, 3315] - - [326, 9876.87] + - [460, 9876.87] - - [4096, 1024, 1, 3214] - - [327, 9847.93] + - [461, 9847.93] - - [1024, 33708, 1, 3900] - - [319, 10331.7] + - [453, 10331.7] - - [64, 160, 400, 160] - - [312, 7440.61] + - [446, 7440.61] - - [1024, 4096, 1, 3367] - - [326, 9926.32] + - [460, 9926.32] - - [1024, 4096, 1, 2917] - - [327, 9904.57] + - [461, 9904.57] - - [1024, 1024, 1, 3995] - - [318, 9000.33] + - [452, 9000.33] - - [64, 132, 480, 134] - - [310, 6146.88] + - [444, 6146.88] - - [1024, 4096, 1, 3544] - - [327, 9924.14] + - [461, 9924.14] - - [4096, 1024, 1, 3414] - - [327, 9867.9] + - [461, 9867.9] - - [4096, 1024, 1, 3565] - - [320, 9870.13] + - [454, 9870.13] - - [1024, 4096, 1, 3512] - - [326, 9919.84] + - [460, 9919.84] - - [1024, 4096, 1, 3191] - - [327, 9914.79] + - [461, 9914.79] - - [64, 27, 2336, 27] - - [294, 3054.71] + - [428, 3054.71] - - [1024, 4096, 1, 3289] - - [327, 9917.2] + - [461, 9917.2] - - [4096, 1024, 1, 3290] - - [326, 9858.41] + - [460, 9858.41] - - [1024, 4096, 1, 3211] - - [327, 9897.16] + - [461, 9897.16] - - [1024, 33708, 1, 3969] - - [320, 10336.1] + - [454, 10336.1] - - [4096, 1024, 1, 3566] - - [326, 9863.0] + - [460, 9863.0] - - [64, 111, 576, 111] - - [300, 6400.91] + - [434, 6400.91] - - [1024, 4096, 1, 3459] - - [326, 9923.03] + - [460, 9923.03] - - [1024, 4096, 1, 3372] - - [319, 9909.86] + - [453, 9909.86] - - [4096, 1024, 1, 3339] - - [326, 9859.3] + - [460, 9859.3] - - [4096, 1024, 1, 3425] - - [326, 9889.34] + - [460, 9889.34] - - [4096, 1024, 1, 3388] - - [326, 9871.67] + - [460, 9871.67] - - [1024, 4096, 1, 3531] - - [319, 9919.0] + - [453, 9919.0] - - [4096, 1024, 1, 3286] - - [327, 9868.42] + - [461, 9868.42] - - [4096, 1024, 1, 3462] - - [326, 9881.88] + - [460, 9881.88] - - [1024, 4096, 1, 3388] - - [319, 9904.69] + - [453, 9904.69] - - [4096, 1024, 1, 3165] - - [319, 9836.33] + - [453, 9836.33] - - [4096, 1024, 1, 3304] - - [326, 9857.55] + - [460, 9857.55] - - [1024, 4096, 1, 2736] - - [326, 9901.07] + - [460, 9901.07] - - [4096, 1024, 1, 3397] - - [326, 9872.1] + - [460, 9872.1] - - [64, 38, 1680, 38] - - [293, 3459.52] + - [427, 3459.52] - - [1024, 4096, 1, 3311] - - [327, 9908.32] + - [461, 9908.32] - - [1024, 4096, 1, 3394] - - [327, 9929.43] + - [461, 9929.43] - - [4096, 1024, 1, 2736] - - [326, 9833.88] + - [460, 9833.88] - - [1024, 4096, 1, 3559] - - [320, 9925.33] + - [454, 9925.33] - - [4096, 1024, 1, 3180] - - [326, 9838.05] + - [460, 9838.05] - - [1024, 4096, 1, 3480] - - [319, 9922.46] + - [453, 9922.46] - - [4096, 1024, 1, 3318] - - [326, 9867.87] + - [460, 9867.87] - - [4096, 1024, 1, 3213] - - [326, 9846.02] + - [460, 9846.02] - - [1024, 4096, 1, 3286] - - [326, 9912.14] + - [460, 9912.14] - - [4096, 1024, 1, 3471] - - [326, 9874.24] + - [460, 9874.24] - - [1024, 4096, 1, 3381] - - [327, 9922.96] + - [461, 9922.96] - - [64, 100, 624, 100] - - [301, 5705.24] + - [435, 5705.24] - - [4096, 1024, 1, 3502] - - [326, 9872.44] + - [460, 9872.44] - - [64, 16, 3840, 16] - - [307, 2091.67] + - [441, 2091.67] - - [1024, 4096, 1, 3552] - - [319, 9943.89] + - [453, 9943.89] - - [4096, 1024, 1, 3519] - - [327, 9869.95] + - [461, 9869.95] - - [1024, 4096, 1, 3300] - - [320, 9916.15] + - [454, 9916.15] - - [1024, 4096, 1, 3419] - - [319, 9914.06] + - [453, 9914.06] - - [4096, 1024, 1, 4030] - - [320, 9893.73] + - [454, 9893.73] - - [4096, 1024, 1, 3976] - - [327, 9898.35] + - [461, 9898.35] - - [1024, 4096, 1, 3473] - - [327, 9928.42] + - [461, 9928.42] - - [1024, 1024, 1, 3977] - - [324, 9009.33] + - [458, 9009.33] - - [4096, 1024, 1, 3428] - - [326, 9876.79] + - [460, 9876.79] - - [1024, 4096, 1, 3433] - - [320, 9923.92] + - [454, 9923.92] - - [4096, 1024, 1, 3534] - - [320, 9864.0] + - [454, 9864.0] - - [4096, 1024, 1, 3461] - - [326, 9873.12] + - [460, 9873.12] - - [4096, 1024, 1, 3681] - - [326, 9898.57] + - [460, 9898.57] - - [4096, 1024, 1, 3495] - - [327, 9876.08] + - [461, 9876.08] - - [4096, 1024, 1, 3351] - - [326, 9879.71] + - [460, 9879.71] - - [1024, 4096, 1, 4059] - - [319, 9948.61] + - [453, 9948.61] - - [4096, 1024, 1, 3990] - - [326, 9900.76] + - [460, 9900.76] - - [1024, 4096, 1, 3325] - - [320, 9903.3] + - [454, 9903.3] - - [1024, 4096, 1, 3408] - - [326, 9932.15] + - [460, 9932.15] - - [64, 59, 1088, 59] - - [300, 5343.77] + - [434, 5343.77] - - [4096, 1024, 1, 3394] - - [327, 9878.17] + - [461, 9878.17] - - [1024, 4096, 1, 3573] - - [327, 9935.3] + - [461, 9935.3] - - [4096, 1024, 1, 3386] - - [326, 9866.38] + - [460, 9866.38] - - [4096, 1024, 1, 3540] - - [326, 9882.33] + - [460, 9882.33] - - [1024, 4096, 1, 3182] - - [320, 9894.45] + - [454, 9894.45] - - [1024, 4096, 1, 3430] - - [319, 9915.24] + - [453, 9915.24] - - [1024, 4096, 1, 3236] - - [327, 9920.56] + - [461, 9920.56] - - [4096, 1024, 1, 2977] - - [326, 9848.08] + - [460, 9848.08] - - [1024, 4096, 1, 3355] - - [326, 9908.78] + - [460, 9908.78] - - [4096, 1024, 1, 3139] - - [326, 9850.71] + - [460, 9850.71] - - [4096, 1024, 1, 3516] - - [320, 9874.21] + - [454, 9874.21] - - [4096, 1024, 1, 3368] - - [320, 9872.64] + - [454, 9872.64] - - [4096, 1024, 1, 3559] - - [319, 9884.32] + - [453, 9884.32] - - [64, 11, 5456, 11] - - [307, 1382.67] + - [441, 1382.67] - - [1024, 4096, 1, 3506] - - [326, 9937.69] + - [460, 9937.69] - - [1024, 4096, 1, 3145] - - [319, 9905.11] + - [453, 9905.11] - - [1024, 4096, 1, 3369] - - [326, 9912.71] + - [460, 9912.71] - - [64, 112, 576, 112] - - [292, 6583.56] + - [426, 6583.56] - - [4096, 1024, 1, 3522] - - [326, 9889.47] + - [460, 9889.47] - - [1024, 33708, 1, 3894] - - [319, 10337.5] + - [453, 10337.5] - - [64, 159, 400, 162] - - [310, 7057.09] + - [444, 7057.09] - - [4096, 1024, 1, 3336] - - [326, 9867.67] + - [460, 9867.67] - - [1024, 4096, 1, 3382] - - [320, 9915.9] + - [454, 9915.9] - - [4096, 1024, 1, 3533] - - [326, 9878.56] + - [460, 9878.56] - - [4096, 1024, 1, 4050] - - [327, 9916.82] + - [461, 9916.82] - - [4096, 1024, 1, 3480] - - [320, 9869.32] + - [454, 9869.32] - - [1024, 4096, 1, 3344] - - [319, 9935.61] + - [453, 9935.61] - - [64, 122, 528, 122] - - [292, 6871.14] + - [426, 6871.14] - - [1024, 4096, 1, 3509] - - [320, 9925.8] + - [454, 9925.8] - - [1024, 4096, 1, 3956] - - [319, 9958.26] + - [453, 9958.26] - - [4096, 1024, 1, 3616] - - [326, 9904.63] + - [460, 9904.63] - - [1024, 4096, 1, 3366] - - [319, 9919.47] + - [453, 9919.47] - - [4096, 1024, 1, 2935] - - [319, 9833.23] + - [453, 9833.23] - - [4096, 1024, 1, 3393] - - [326, 9877.45] + - [460, 9877.45] - - [4096, 1024, 1, 3547] - - [320, 9865.1] + - [454, 9865.1] - - [1024, 4096, 1, 3499] - - [327, 9912.49] + - [461, 9912.49] - - [4096, 1024, 1, 3357] - - [326, 9855.28] + - [460, 9855.28] - - [4096, 1024, 1, 3272] - - [326, 9861.97] + - [460, 9861.97] - - [4096, 1024, 1, 3207] - - [326, 9847.78] + - [460, 9847.78] - - [4096, 1024, 1, 3894] - - [326, 9918.86] + - [460, 9918.86] - - [1024, 4096, 1, 3444] - - [326, 9932.71] + - [460, 9932.71] - - [4096, 1024, 1, 3561] - - [326, 9872.61] + - [460, 9872.61] - - [4096, 1024, 1, 3376] - - [326, 9885.59] + - [460, 9885.59] - - [1024, 4096, 1, 3458] - - [326, 9929.39] + - [460, 9929.39] - - [4096, 1024, 1, 3231] - - [320, 9847.08] + - [454, 9847.08] - - [64, 228, 272, 228] - - [321, 7302.69] + - [455, 7302.69] - - [1024, 4096, 1, 3505] - - [327, 9931.63] + - [461, 9931.63] - - [4096, 1024, 1, 3277] - - [326, 9857.2] + - [460, 9857.2] - - [64, 21, 2976, 21] - - [296, 2436.14] + - [430, 2436.14] - - [1024, 4096, 1, 3391] - - [326, 9911.25] + - [460, 9911.25] - - [64, 32, 1984, 32] - - [308, 3572.17] + - [442, 3572.17] - - [1024, 4096, 1, 3536] - - [327, 9946.9] + - [461, 9946.9] - - [1024, 4096, 1, 3063] - - [326, 9906.92] + - [460, 9906.92] - - [1024, 1024, 1, 3925] - - [318, 9011.45] + - [452, 9011.45] - - [1024, 4096, 1, 3189] - - [320, 9900.95] + - [454, 9900.95] - - [1024, 4096, 1, 2505] - - [326, 9854.85] + - [460, 9854.85] - - [4096, 1024, 1, 3454] - - [319, 9864.96] + - [453, 9864.96] - - [1024, 4096, 1, 3405] - - [327, 9906.33] + - [461, 9906.33] - - [1024, 33708, 1, 4050] - - [320, 10343.7] + - [454, 10343.7] - - [4096, 1024, 1, 3520] - - [326, 9887.03] + - [460, 9887.03] - - [64, 93, 688, 93] - - [303, 6222.86] + - [437, 6222.86] - - [1024, 4096, 1, 3487] - - [327, 9918.69] + - [461, 9918.69] - - [1024, 4096, 1, 3558] - - [327, 9930.99] + - [461, 9930.99] - - [4096, 1024, 1, 3297] - - [326, 9874.31] + - [460, 9874.31] - - [1024, 1024, 1, 3840] - - [322, 9075.42] + - [456, 9075.42] - - [1024, 4096, 1, 3483] - - [326, 9915.38] + - [460, 9915.38] - - [1024, 1024, 1, 3956] - - [325, 9010.03] + - [459, 9010.03] - - [1024, 33708, 1, 3751] - - [320, 10325.9] + - [454, 10325.9] - - [4096, 1024, 1, 3380] - - [326, 9888.47] + - [460, 9888.47] - - [1024, 4096, 1, 3380] - - [319, 9927.25] + - [453, 9927.25] - - [1024, 4096, 1, 3396] - - [327, 9931.96] + - [461, 9931.96] - - [1024, 4096, 1, 3497] - - [320, 9914.86] + - [454, 9914.86] - - [1024, 4096, 1, 3502] - - [327, 9921.52] + - [461, 9921.52] - - [1024, 1024, 1, 3976] - - [322, 9060.3] + - [456, 9060.3] - - [1024, 4096, 1, 3138] - - [320, 9908.66] + - [454, 9908.66] - - [4096, 1024, 1, 3939] - - [319, 9910.23] + - [453, 9910.23] - - [1024, 4096, 1, 3303] - - [320, 9916.64] + - [454, 9916.64] - - [64, 111, 576, 112] - - [300, 6495.19] + - [434, 6495.19] - - [1024, 4096, 1, 3418] - - [326, 9913.35] + - [460, 9913.35] - - [1024, 4096, 1, 3224] - - [320, 9904.05] + - [454, 9904.05] - - [4096, 1024, 1, 3978] - - [326, 9896.28] + - [460, 9896.28] - - [1024, 4096, 1, 3472] - - [319, 9937.48] + - [453, 9937.48] - - [4096, 1024, 1, 3353] - - [327, 9863.97] + - [461, 9863.97] - - [4096, 1024, 1, 3362] - - [326, 9871.06] + - [460, 9871.06] - - [1024, 33708, 1, 3978] - - [319, 10325.4] + - [453, 10325.4] - - [64, 100, 624, 102] - - [295, 5695.67] + - [429, 5695.67] - - [1024, 4096, 1, 3432] - - [327, 9915.56] + - [461, 9915.56] - - [1024, 4096, 1, 3139] - - [326, 9914.21] + - [460, 9914.21] - - [1024, 4096, 1, 3341] - - [327, 9912.1] + - [461, 9912.1] - - [1024, 4096, 1, 3494] - - [320, 9924.6] + - [454, 9924.6] - - [1024, 4096, 1, 3969] - - [319, 9952.28] + - [453, 9952.28] - - [1024, 4096, 1, 3163] - - [327, 9911.79] + - [461, 9911.79] - - [1024, 1024, 1, 3955] - - [317, 9097.86] + - [451, 9097.86] - - [4096, 1024, 1, 3405] - - [326, 9853.84] + - [460, 9853.84] - - [1024, 1024, 1, 4030] - - [317, 9083.86] + - [451, 9083.86] - - [4096, 1024, 1, 3453] - - [326, 9858.88] + - [460, 9858.88] - - [1024, 4096, 1, 3411] - - [327, 9926.54] + - [461, 9926.54] - - [1024, 4096, 1, 3527] - - [320, 9922.65] + - [454, 9922.65] - - [4096, 1024, 1, 3474] - - [326, 9878.49] + - [460, 9878.49] - - [1024, 4096, 1, 3572] - - [326, 9932.0] + - [460, 9932.0] - - [4096, 1024, 1, 3293] - - [326, 9848.26] + - [460, 9848.26] - - [4096, 1024, 1, 3247] - - [326, 9861.45] + - [460, 9861.45] - - [64, 15, 4096, 15] - - [307, 1955.75] + - [441, 1955.75] - - [1024, 4096, 1, 3425] - - [327, 9936.4] + - [461, 9936.4] - - [1024, 4096, 1, 3354] - - [319, 9917.55] + - [453, 9917.55] - - [4096, 1024, 1, 3382] - - [326, 9885.49] + - [460, 9885.49] - - [4096, 1024, 1, 3236] - - [326, 9860.6] + - [460, 9860.6] - - [1024, 4096, 1, 3519] - - [327, 9919.3] + - [461, 9919.3] - - [4096, 1024, 1, 3354] - - [326, 9854.75] + - [460, 9854.75] - - [4096, 1024, 1, 3501] - - [327, 9869.62] + - [461, 9869.62] - - [1024, 1024, 1, 3906] - - [325, 9104.99] + - [459, 9104.99] - - [4096, 1024, 1, 3266] - - [326, 9873.97] + - [460, 9873.97] - - [64, 101, 624, 102] - - [295, 5765.52] + - [429, 5765.52] - - [1024, 4096, 1, 3368] - - [326, 9909.77] + - [460, 9909.77] - - [1024, 4096, 1, 4030] - - [327, 9940.27] + - [461, 9940.27] - - [1024, 4096, 1, 3533] - - [320, 9916.64] + - [454, 9916.64] - - [4096, 1024, 1, 3332] - - [327, 9876.45] + - [461, 9876.45] - - [4096, 1024, 1, 3584] - - [326, 9896.6] + - [460, 9896.6] - - [1024, 4096, 1, 3616] - - [326, 9957.18] + - [460, 9957.18] - - [4096, 1024, 1, 3265] - - [326, 9877.78] + - [460, 9877.78] - - [4096, 1024, 1, 3361] - - [326, 9888.61] + - [460, 9888.61] - - [4096, 1024, 1, 3467] - - [326, 9863.4] + - [460, 9863.4] - - [1024, 4096, 1, 3454] - - [320, 9904.89] + - [454, 9904.89] - - [1024, 4096, 1, 3101] - - [327, 9893.12] + - [461, 9893.12] - - [1024, 4096, 1, 3508] - - [327, 9931.54] + - [461, 9931.54] - - [4096, 1024, 1, 3267] - - [326, 9864.48] + - [460, 9864.48] - - [64, 54, 1184, 54] - - [292, 4906.02] + - [426, 4906.02] - - [4096, 1024, 1, 3419] - - [326, 9872.56] + - [460, 9872.56] - - [4096, 1024, 1, 3822] - - [326, 9892.63] + - [460, 9892.63] - - [1024, 4096, 1, 3266] - - [326, 9918.58] + - [460, 9918.58] - - [4096, 1024, 1, 3440] - - [327, 9890.16] + - [461, 9890.16] - - [1024, 4096, 1, 3361] - - [326, 9930.97] + - [460, 9930.97] - - [1024, 4096, 1, 3546] - - [320, 9926.56] + - [454, 9926.56] - - [4096, 1024, 1, 3473] - - [326, 9889.06] + - [460, 9889.06] - - [4096, 1024, 1, 3546] - - [327, 9872.27] + - [461, 9872.27] - - [1024, 4096, 1, 3088] - - [320, 9918.03] + - [454, 9918.03] - - [1024, 4096, 1, 3535] - - [327, 9921.2] + - [461, 9921.2] - - [1024, 4096, 1, 3447] - - [327, 9920.63] + - [461, 9920.63] - - [1024, 4096, 1, 3560] - - [326, 9925.48] + - [460, 9925.48] - - [1024, 4096, 1, 3422] - - [320, 9922.21] + - [454, 9922.21] - - [1024, 4096, 1, 3469] - - [319, 9906.18] + - [453, 9906.18] - - [4096, 1024, 1, 3488] - - [326, 9903.26] + - [460, 9903.26] - - [1024, 4096, 1, 3110] - - [326, 9906.76] + - [460, 9906.76] - - [1024, 4096, 1, 3265] - - [327, 9916.69] + - [461, 9916.69] - - [1024, 4096, 1, 3291] - - [326, 9902.73] + - [460, 9902.73] - - [1024, 4096, 1, 3390] - - [327, 9907.22] + - [461, 9907.22] - - [4096, 1024, 1, 3046] - - [326, 9847.68] + - [460, 9847.68] - - [1024, 4096, 1, 3539] - - [327, 9933.49] + - [461, 9933.49] - - [4096, 1024, 1, 3221] - - [327, 9860.74] + - [461, 9860.74] - - [4096, 1024, 1, 3433] - - [326, 9872.74] + - [460, 9872.74] - - [4096, 1024, 1, 3364] - - [327, 9881.91] + - [461, 9881.91] - - [4096, 1024, 1, 3470] - - [326, 9858.56] + - [460, 9858.56] - - [1024, 4096, 1, 3404] - - [319, 9907.27] + - [453, 9907.27] - - [1024, 33708, 1, 3968] - - [320, 10350.3] + - [454, 10350.3] - - [4096, 1024, 1, 3088] - - [326, 9869.06] + - [460, 9869.06] - - [1024, 4096, 1, 3247] - - [326, 9901.02] + - [460, 9901.02] - - [1024, 33708, 1, 3996] - - [319, 10328.5] + - [453, 10328.5] - - [4096, 1024, 1, 3482] - - [327, 9866.99] + - [461, 9866.99] - - [1024, 1024, 1, 3796] - - [322, 9031.68] + - [456, 9031.68] - - [4096, 1024, 1, 3995] - - [327, 9896.78] + - [461, 9896.78] - - [1024, 1024, 1, 3859] - - [324, 9097.36] + - [458, 9097.36] - - [1024, 4096, 1, 3280] - - [320, 9934.05] + - [454, 9934.05] - - [4096, 1024, 1, 3271] - - [327, 9860.09] + - [461, 9860.09] - - [64, 10, 5952, 10] - - [307, 1221.02] + - [441, 1221.02] - - [4096, 1024, 1, 3545] - - [326, 9877.35] + - [460, 9877.35] - - [4096, 1024, 1, 3476] - - [319, 9882.57] + - [453, 9882.57] - - [4096, 1024, 1, 3496] - - [320, 9880.5] + - [454, 9880.5] - - [4096, 1024, 1, 3191] - - [320, 9858.7] + - [454, 9858.7] - - [4096, 1024, 1, 3311] - - [327, 9853.2] + - [461, 9853.2] - - [1024, 4096, 1, 3302] - - [327, 9919.32] + - [461, 9919.32] - - [1024, 4096, 1, 3681] - - [326, 9944.99] + - [460, 9944.99] - - [4096, 1024, 1, 3582] - - [319, 9869.77] + - [453, 9869.77] - - [4096, 1024, 1, 3421] - - [327, 9856.08] + - [461, 9856.08] - - [4096, 1024, 1, 3560] - - [320, 9884.48] + - [454, 9884.48] - - [1024, 4096, 1, 3495] - - [327, 9930.13] + - [461, 9930.13] - - [4096, 1024, 1, 3186] - - [326, 9870.59] + - [460, 9870.59] - - [4096, 1024, 1, 3925] - - [326, 9904.0] + - [460, 9904.0] - - [64, 71, 896, 71] - - [311, 5004.79] + - [445, 5004.79] - - [1024, 4096, 1, 3435] - - [327, 9916.58] + - [461, 9916.58] - - [4096, 1024, 1, 3434] - - [326, 9871.29] + - [460, 9871.29] - - [1024, 33708, 1, 4012] - - [319, 10332.5] + - [453, 10332.5] - - [1024, 4096, 1, 3340] - - [319, 9918.11] + - [453, 9918.11] - - [1024, 1024, 1, 3860] - - [317, 8999.36] + - [451, 8999.36] - - [4096, 1024, 1, 3489] - - [326, 9882.02] + - [460, 9882.02] - - [1024, 4096, 1, 3162] - - [327, 9906.28] + - [461, 9906.28] - - [4096, 1024, 1, 3436] - - [326, 9858.12] + - [460, 9858.12] - - [1024, 1024, 1, 4005] - - [323, 9043.06] + - [457, 9043.06] - - [64, 84, 752, 84] - - [296, 5629.93] + - [430, 5629.93] - - [4096, 1024, 1, 3574] - - [326, 9886.7] + - [460, 9886.7] - - [4096, 1024, 1, 3469] - - [319, 9856.26] + - [453, 9856.26] - - [1024, 4096, 1, 3410] - - [320, 9924.74] + - [454, 9924.74] - - [1024, 4096, 1, 3216] - - [319, 9930.67] + - [453, 9930.67] - - [4096, 1024, 1, 3095] - - [326, 9847.01] + - [460, 9847.01] - - [1024, 1024, 1, 3990] - - [325, 9089.04] + - [459, 9089.04] - - [4096, 1024, 1, 3448] - - [326, 9863.94] + - [460, 9863.94] - - [1024, 4096, 1, 3176] - - [327, 9914.01] + - [461, 9914.01] - - [64, 49, 1296, 49] - - [292, 4437.46] + - [426, 4437.46] - - [4096, 1024, 1, 2918] - - [326, 9830.93] + - [460, 9830.93] - - [64, 14, 4368, 14] - - [306, 1802.47] + - [440, 1802.47] - - [1024, 4096, 1, 3424] - - [326, 9934.05] + - [460, 9934.05] - - [4096, 1024, 1, 3402] - - [319, 9863.12] + - [453, 9863.12] - - [4096, 1024, 1, 3145] - - [320, 9856.56] + - [454, 9856.56] - - [64, 134, 480, 134] - - [312, 6184.05] + - [446, 6184.05] - - [1024, 33708, 1, 3976] - - [320, 10330.1] + - [454, 10330.1] - - [4096, 1024, 1, 3518] - - [319, 9856.07] + - [453, 9856.07] - - [4096, 1024, 1, 3110] - - [326, 9856.46] + - [460, 9856.46] - - [4096, 1024, 1, 3325] - - [326, 9852.36] + - [460, 9852.36] - - [1024, 33708, 1, 3999] - - [319, 10329.7] + - [453, 10329.7] - - [4096, 1024, 1, 2985] - - [326, 9837.3] + - [460, 9837.3] - - [1024, 4096, 1, 3371] - - [319, 9913.03] + - [453, 9913.03] - - [4096, 1024, 1, 3342] - - [326, 9863.16] + - [460, 9863.16] - - [4096, 1024, 1, 3141] - - [320, 9849.91] + - [454, 9849.91] - - [4096, 1024, 1, 3532] - - [320, 9866.3] + - [454, 9866.3] - - [64, 78, 816, 78] - - [297, 5316.88] + - [431, 5316.88] - - [1024, 4096, 1, 3169] - - [327, 9910.45] + - [461, 9910.45] - - [1024, 4096, 1, 3514] - - [326, 9918.0] + - [460, 9918.0] - - [4096, 1024, 1, 3780] - - [327, 9899.75] + - [461, 9899.75] - - [1024, 4096, 1, 3098] - - [319, 9901.62] + - [453, 9901.62] - - [1024, 4096, 1, 3449] - - [327, 9919.85] + - [461, 9919.85] - - [1024, 4096, 1, 3222] - - [319, 9917.66] + - [453, 9917.66] - - [1024, 4096, 1, 3346] - - [320, 9912.91] + - [454, 9912.91] - - [4096, 1024, 1, 3064] - - [327, 9848.79] + - [461, 9848.79] - - [4096, 1024, 1, 3511] - - [326, 9873.39] + - [460, 9873.39] - - [4096, 1024, 1, 3384] - - [326, 9870.98] + - [460, 9870.98] - - [4096, 1024, 1, 3356] - - [320, 9853.45] + - [454, 9853.45] - - [1024, 4096, 1, 3796] - - [319, 9940.66] + - [453, 9940.66] - - [4096, 1024, 1, 3427] - - [326, 9883.14] + - [460, 9883.14] - - [4096, 1024, 1, 3390] - - [326, 9863.79] + - [460, 9863.79] - - [4096, 1024, 1, 3573] - - [327, 9886.02] + - [461, 9886.02] - - [4096, 1024, 1, 3456] - - [320, 9890.61] + - [454, 9890.61] - - [1024, 4096, 1, 3360] - - [327, 9938.1] + - [461, 9938.1] - - [1024, 33708, 1, 3977] - - [320, 10327.2] + - [454, 10327.2] - - [1024, 4096, 1, 2918] - - [319, 9902.84] + - [453, 9902.84] - - [4096, 1024, 1, 3975] - - [326, 9905.27] + - [460, 9905.27] - - [4096, 1024, 1, 3525] - - [327, 9879.91] + - [461, 9879.91] - - [4096, 1024, 1, 3398] - - [319, 9873.91] + - [453, 9873.91] - - [4096, 1024, 1, 3640] - - [326, 9885.16] + - [460, 9885.16] - - [1024, 1024, 1, 3999] - - [318, 8995.42] + - [452, 8995.42] - - [4096, 1024, 1, 3014] - - [326, 9841.32] + - [460, 9841.32] - - [1024, 4096, 1, 3446] - - [319, 9917.21] + - [453, 9917.21] - - [1024, 33708, 1, 3796] - - [319, 10339.0] + - [453, 10339.0] - - [4096, 1024, 1, 3101] - - [319, 9827.34] + - [453, 9827.34] - - [4096, 1024, 1, 3563] - - [327, 9863.03] + - [461, 9863.03] - - [4096, 1024, 1, 3539] - - [319, 9889.54] + - [453, 9889.54] - - [4096, 1024, 1, 3182] - - [326, 9833.79] + - [460, 9833.79] - - [1024, 4096, 1, 3468] - - [320, 9913.05] + - [454, 9913.05] - - [4096, 1024, 1, 3312] - - [326, 9889.85] + - [460, 9889.85] - - [4096, 1024, 1, 3215] - - [326, 9853.88] + - [460, 9853.88] - - [4096, 1024, 1, 3910] - - [326, 9894.72] + - [460, 9894.72] - - [1024, 33708, 1, 3780] - - [320, 10332.0] + - [454, 10332.0] - - [1024, 4096, 1, 3290] - - [326, 9915.08] + - [460, 9915.08] - - [1024, 4096, 1, 4012] - - [326, 9942.65] + - [460, 9942.65] - - [1024, 4096, 1, 3385] - - [326, 9915.83] + - [460, 9915.83] - - [1024, 33708, 1, 3975] - - [319, 10330.1] + - [453, 10330.1] - - [4096, 1024, 1, 3996] - - [326, 9891.31] + - [460, 9891.31] - - [4096, 1024, 1, 2765] - - [327, 9800.38] + - [461, 9800.38] - - [4096, 1024, 1, 3538] - - [327, 9886.22] + - [461, 9886.22] - - [4096, 1024, 1, 3415] - - [327, 9874.6] + - [461, 9874.6] - - [1024, 4096, 1, 3554] - - [326, 9931.99] + - [460, 9931.99] - - [4096, 1024, 1, 3513] - - [320, 9874.25] + - [454, 9874.25] - - [1024, 4096, 1, 3304] - - [320, 9907.73] + - [454, 9907.73] - - [4096, 1024, 1, 3294] - - [326, 9851.25] + - [460, 9851.25] - - [4096, 1024, 1, 3396] - - [327, 9880.7] + - [461, 9880.7] - - [1024, 4096, 1, 3213] - - [320, 9891.12] + - [454, 9891.12] - - [4096, 1024, 1, 3137] - - [320, 9857.41] + - [454, 9857.41] - - [4096, 1024, 1, 3552] - - [326, 9904.22] + - [460, 9904.22] - - [1024, 1024, 1, 4020] - - [325, 9098.87] + - [459, 9098.87] - - [64, 13, 4672, 13] - - [307, 1693.54] + - [441, 1693.54] - - [1024, 4096, 1, 3461] - - [326, 9918.45] + - [460, 9918.45] - - [4096, 1024, 1, 3263] - - [319, 9843.89] + - [453, 9843.89] - - [4096, 1024, 1, 3430] - - [326, 9885.26] + - [460, 9885.26] - - [4096, 1024, 1, 3389] - - [326, 9859.23] + - [460, 9859.23] - - [4096, 1024, 1, 3528] - - [326, 9873.01] + - [460, 9873.01] - - [1024, 4096, 1, 3463] - - [327, 9929.61] + - [461, 9929.61] - - [4096, 1024, 1, 3526] - - [327, 9876.9] + - [461, 9876.9] - - [4096, 1024, 1, 3154] - - [326, 9858.25] + - [460, 9858.25] - - [4096, 1024, 1, 3499] - - [327, 9862.92] + - [461, 9862.92] - - [1024, 1024, 1, 3939] - - [325, 9107.41] + - [459, 9107.41] - - [4096, 1024, 1, 3955] - - [327, 9906.28] + - [461, 9906.28] - - [1024, 4096, 1, 3297] - - [320, 9925.34] + - [454, 9925.34] - - [1024, 4096, 1, 3233] - - [326, 9920.65] + - [460, 9920.65] - - [1024, 4096, 1, 3226] - - [326, 9911.35] + - [460, 9911.35] - - [4096, 1024, 1, 3404] - - [326, 9867.28] + - [460, 9867.28] - - [4096, 1024, 1, 3355] - - [326, 9862.66] + - [460, 9862.66] - - [1024, 4096, 1, 3542] - - [326, 9926.49] + - [460, 9926.49] - - [4096, 1024, 1, 3181] - - [327, 9831.86] + - [461, 9831.86] - - [1024, 4096, 1, 3474] - - [326, 9928.03] + - [460, 9928.03] - - [4096, 1024, 1, 3319] - - [326, 9870.28] + - [460, 9870.28] - - [1024, 4096, 1, 3434] - - [319, 9917.51] + - [453, 9917.51] - - [1024, 4096, 1, 3860] - - [326, 9945.32] + - [460, 9945.32] - - [1024, 4096, 1, 3343] - - [319, 9914.66] + - [453, 9914.66] - - [64, 77, 816, 78] - - [297, 5276.97] + - [431, 5276.97] - - [1024, 4096, 1, 3488] - - [326, 9945.81] + - [460, 9945.81] - - [1024, 4096, 1, 3046] - - [326, 9908.78] + - [460, 9908.78] - - [1024, 4096, 1, 3141] - - [327, 9909.18] + - [461, 9909.18] - - [1024, 4096, 1, 3516] - - [327, 9911.38] + - [461, 9911.38] - - [4096, 1024, 1, 3147] - - [326, 9840.47] + - [460, 9840.47] - - [1024, 1024, 1, 4059] - - [318, 9009.78] + - [452, 9009.78] - - [1024, 1024, 1, 3944] - - [318, 9006.17] + - [452, 9006.17] - - [1024, 4096, 1, 3421] - - [327, 9919.86] + - [461, 9919.86] - - [4096, 1024, 1, 3944] - - [320, 9899.53] + - [454, 9899.53] - - [64, 45, 1424, 45] - - [305, 4068.67] + - [439, 4068.67] - - [1024, 4096, 1, 3574] - - [320, 9930.19] + - [454, 9930.19] - - [1024, 4096, 1, 3977] - - [319, 9944.28] + - [453, 9944.28] - - [1024, 1024, 1, 3968] - - [324, 9045.22] + - [458, 9045.22] - - [1024, 4096, 1, 2985] - - [326, 9887.65] + - [460, 9887.65] - - [64, 193, 320, 193] - - [313, 6631.35] + - [447, 6631.35] - - [1024, 4096, 1, 3427] - - [327, 9933.41] + - [461, 9933.41] - - [64, 12, 5040, 12] - - [307, 1552.53] + - [441, 1552.53] - - [1024, 4096, 1, 3482] - - [327, 9942.22] + - [461, 9942.22] - - [1024, 4096, 1, 3332] - - [319, 9923.58] + - [453, 9923.58] - - [1024, 1024, 1, 3720] - - [323, 9039.56] + - [457, 9039.56] - - [4096, 1024, 1, 3308] - - [327, 9852.66] + - [461, 9852.66] - - [1024, 4096, 1, 3513] - - [327, 9919.99] + - [461, 9919.99] - - [1024, 4096, 1, 3154] - - [320, 9908.46] + - [454, 9908.46] - - [1024, 4096, 1, 3955] - - [327, 9950.01] + - [461, 9950.01] - - [1024, 4096, 1, 2967] - - [327, 9897.44] + - [461, 9897.44] - - [1024, 33708, 1, 3942] - - [319, 10336.1] + - [453, 10336.1] - - [1024, 4096, 1, 3319] - - [327, 9912.45] + - [461, 9912.45] - - [4096, 1024, 1, 3860] - - [326, 9909.29] + - [460, 9909.29] - - [1024, 4096, 1, 3548] - - [319, 9924.21] + - [453, 9924.21] - - [4096, 1024, 1, 3977] - - [327, 9891.44] + - [461, 9891.44] - - [4096, 1024, 1, 3535] - - [326, 9867.84] + - [460, 9867.84] - - [1024, 4096, 1, 3541] - - [327, 9923.16] + - [461, 9923.16] - - [1024, 1024, 1, 3910] - - [324, 9080.4] + - [458, 9080.4] - - [1024, 33708, 1, 3584] - - [319, 10333.0] + - [453, 10333.0] - - [1024, 4096, 1, 3168] - - [320, 9926.27] + - [454, 9926.27] - - [1024, 4096, 1, 3448] - - [327, 9922.42] + - [461, 9922.42] - - [4096, 1024, 1, 3343] - - [326, 9857.23] + - [460, 9857.23] - - [64, 35, 1808, 35] - - [309, 3175.44] + - [443, 3175.44] - - [1024, 4096, 1, 3357] - - [320, 9902.41] + - [454, 9902.41] - - [64, 143, 432, 143] - - [310, 6489.7] + - [444, 6489.7] - - [4096, 1024, 1, 3510] - - [326, 9867.4] + - [460, 9867.4] - - [4096, 1024, 1, 3369] - - [326, 9863.44] + - [460, 9863.44] - - [64, 92, 688, 93] - - [297, 6188.3] + - [431, 6188.3] - - [4096, 1024, 1, 3379] - - [326, 9870.12] + - [460, 9870.12] - - [1024, 4096, 1, 3276] - - [326, 9904.77] + - [460, 9904.77] - - [1024, 4096, 1, 3363] - - [326, 9925.13] + - [460, 9925.13] - - [4096, 1024, 1, 3055] - - [326, 9831.92] + - [460, 9831.92] - - [1024, 4096, 1, 3524] - - [319, 9923.79] + - [453, 9923.79] - - [4096, 1024, 1, 3057] - - [326, 9852.87] + - [460, 9852.87] - - [1024, 33708, 1, 3720] - - [320, 10327.1] + - [454, 10327.1] - - [1024, 4096, 1, 3383] - - [319, 9919.39] + - [453, 9919.39] - - [1024, 4096, 1, 3522] - - [320, 9932.56] + - [454, 9932.56] - - [1024, 33708, 1, 3956] - - [319, 10333.8] + - [453, 10333.8] - - [1024, 4096, 1, 3481] - - [319, 9922.08] + - [453, 9922.08] - - [4096, 1024, 1, 3562] - - [327, 9874.86] + - [461, 9874.86] - - [4096, 1024, 1, 3299] - - [326, 9872.97] + - [460, 9872.97] - - [1024, 4096, 1, 3262] - - [320, 9924.83] + - [454, 9924.83] - - [1024, 4096, 1, 3840] - - [319, 9961.84] + - [453, 9961.84] - - [1024, 33708, 1, 4026] - - [319, 10334.3] + - [453, 10334.3] - - [4096, 1024, 1, 3168] - - [320, 9878.45] + - [454, 9878.45] - - [64, 101, 624, 101] - - [300, 5734.72] + - [434, 5734.72] - - [1024, 4096, 1, 3999] - - [319, 9947.1] + - [453, 9947.1] - - [1024, 4096, 1, 3549] - - [319, 9923.3] + - [453, 9923.3] - - [4096, 1024, 1, 3375] - - [326, 9868.89] + - [460, 9868.89] - - [1024, 4096, 1, 3496] - - [327, 9928.67] + - [461, 9928.67] - - [64, 29, 2176, 29] - - [296, 3290.02] + - [430, 3290.02] - - [1024, 4096, 1, 3190] - - [327, 9897.61] + - [461, 9897.61] - - [4096, 1024, 1, 3273] - - [327, 9853.65] + - [461, 9853.65] - - [1024, 4096, 1, 3406] - - [326, 9907.04] + - [460, 9907.04] - - [4096, 1024, 1, 4005] - - [319, 9907.97] + - [453, 9907.97] - - [4096, 1024, 1, 3555] - - [326, 9878.96] + - [460, 9878.96] - - [4096, 1024, 1, 2505] - - [326, 9785.1] + - [460, 9785.1] - - [1024, 4096, 1, 3460] - - [326, 9930.24] + - [460, 9930.24] - - [64, 17, 3632, 17] - - [297, 1917.27] + - [431, 1917.27] - - [1024, 4096, 1, 3579] - - [320, 9920.94] + - [454, 9920.94] - - [1024, 33708, 1, 4030] - - [320, 10327.7] + - [454, 10327.7] - - [1024, 4096, 1, 3510] - - [320, 9931.31] + - [454, 9931.31] - - [1024, 1024, 1, 3969] - - [317, 9020.83] + - [451, 9020.83] - - [1024, 4096, 1, 3282] - - [327, 9920.05] + - [461, 9920.05] - - [1024, 4096, 1, 3377] - - [319, 9927.34] + - [453, 9927.34] - - [1024, 4096, 1, 2935] - - [327, 9903.48] + - [461, 9903.48] - - [64, 41, 1552, 41] - - [297, 3740.48] + - [431, 3740.48] - - [1024, 4096, 1, 3498] - - [319, 9915.01] + - [453, 9915.01] - - [1024, 4096, 1, 3593] - - [326, 9925.64] + - [460, 9925.64] - - [1024, 1024, 1, 3948] - - [325, 9009.03] + - [459, 9009.03] - - [4096, 1024, 1, 3226] - - [327, 9854.75] + - [461, 9854.75] - - [1024, 4096, 1, 2499] - - [326, 9904.82] + - [460, 9904.82] - - [1024, 4096, 1, 3296] - - [319, 9926.89] + - [453, 9926.89] - - [1024, 4096, 1, 3455] - - [326, 9917.52] + - [460, 9917.52] - - [1024, 4096, 1, 3399] - - [320, 9919.7] + - [454, 9919.7] - - [1024, 4096, 1, 3205] - - [319, 9917.74] + - [453, 9917.74] - - [4096, 1024, 1, 4026] - - [327, 9897.81] + - [461, 9897.81] - - [1024, 4096, 1, 3484] - - [319, 9915.53] + - [453, 9915.53] - - [4096, 1024, 1, 3302] - - [327, 9862.8] + - [461, 9862.8] - - [1024, 4096, 1, 3485] - - [327, 9913.0] + - [461, 9913.0] - - [1024, 1024, 1, 3996] - - [325, 9008.77] + - [459, 9008.77] - - [1024, 4096, 1, 3126] - - [320, 9910.16] + - [454, 9910.16] - - [1024, 4096, 1, 4050] - - [319, 9951.21] + - [453, 9951.21] - - [4096, 1024, 1, 3235] - - [320, 9870.74] + - [454, 9870.74] - - [1024, 33708, 1, 3955] - - [319, 10336.1] + - [453, 10336.1] - - [1024, 4096, 1, 3342] - - [319, 9903.85] + - [453, 9903.85] - - [1024, 1024, 1, 3900] - - [324, 9082.92] + - [458, 9082.92] - - [1024, 4096, 1, 3397] - - [327, 9922.7] + - [461, 9922.7] - - [4096, 1024, 1, 3491] - - [327, 9880.75] + - [461, 9880.75] - - [1024, 4096, 1, 3503] - - [319, 9923.28] + - [453, 9923.28] - - [1024, 4096, 1, 3140] - - [320, 9908.41] + - [454, 9908.41] - - [4096, 1024, 1, 3121] - - [326, 9860.32] + - [460, 9860.32] - - [4096, 1024, 1, 3276] - - [326, 9854.19] + - [460, 9854.19] - - [1024, 4096, 1, 3321] - - [327, 9917.86] + - [461, 9917.86] - - [1024, 4096, 1, 3870] - - [327, 9931.07] + - [461, 9931.07] - - [4096, 1024, 1, 3475] - - [326, 9877.58] + - [460, 9877.58] - - [1024, 4096, 1, 2984] - - [326, 9895.59] + - [460, 9895.59] - - [4096, 1024, 1, 3363] - - [320, 9873.44] + - [454, 9873.44] - - [1024, 4096, 1, 3582] - - [326, 9920.87] + - [460, 9920.87] - - [4096, 1024, 1, 3509] - - [326, 9886.86] + - [460, 9886.86] - - [1024, 4096, 1, 3426] - - [319, 9928.86] + - [453, 9928.86] - - [4096, 1024, 1, 3136] - - [326, 9872.61] + - [460, 9872.61] - - [1024, 4096, 1, 3232] - - [327, 9926.29] + - [461, 9926.29] - - [4096, 1024, 1, 3103] - - [326, 9839.03] + - [460, 9839.03] - - [1024, 4096, 1, 3335] - - [320, 9913.37] + - [454, 9913.37] - - [1024, 4096, 1, 3900] - - [319, 9938.01] + - [453, 9938.01] - - [4096, 1024, 1, 3512] - - [320, 9877.26] + - [454, 9877.26] - - [4096, 1024, 1, 3222] - - [326, 9859.77] + - [460, 9859.77] - - [1024, 4096, 1, 3165] - - [326, 9899.71] + - [460, 9899.71] - - [4096, 1024, 1, 3408] - - [326, 9899.68] + - [460, 9899.68] - - [4096, 1024, 1, 3751] - - [326, 9891.49] + - [460, 9891.49] - - [1024, 4096, 1, 3318] - - [319, 9913.42] + - [453, 9913.42] - - [4096, 1024, 1, 3442] - - [327, 9880.21] + - [461, 9880.21] - - [1024, 4096, 1, 3413] - - [326, 9921.9] + - [460, 9921.9] - - [4096, 1024, 1, 3524] - - [326, 9879.22] + - [460, 9879.22] - - [1024, 4096, 1, 3976] - - [327, 9945.57] + - [461, 9945.57] - - [1024, 4096, 1, 3475] - - [327, 9932.51] + - [461, 9932.51] - - [1024, 4096, 1, 3534] - - [319, 9911.49] + - [453, 9911.49] - - [4096, 1024, 1, 3301] - - [326, 9872.75] + - [460, 9872.75] - - [4096, 1024, 1, 3248] - - [326, 9878.22] + - [460, 9878.22] - - [1024, 4096, 1, 2977] - - [320, 9899.93] + - [454, 9899.93] - - [4096, 1024, 1, 3346] - - [326, 9876.07] + - [460, 9876.07] - - [1024, 4096, 1, 3451] - - [319, 9920.16] + - [453, 9920.16] - - [1024, 4096, 1, 3257] - - [320, 9905.02] + - [454, 9905.02] - - [1024, 1024, 1, 3640] - - [318, 8983.39] + - [452, 8983.39] - - [1024, 4096, 1, 3356] - - [319, 9904.48] + - [453, 9904.48] - - [4096, 1024, 1, 3348] - - [327, 9872.53] + - [461, 9872.53] - - [4096, 1024, 1, 3335] - - [326, 9865.82] + - [460, 9865.82] - - [4096, 1024, 1, 3505] - - [326, 9888.88] + - [460, 9888.88] - - [1024, 4096, 1, 3490] - - [319, 9938.0] + - [453, 9938.0] - - [4096, 1024, 1, 3447] - - [326, 9865.39] + - [460, 9865.39] - - [1024, 4096, 1, 3267] - - [327, 9919.32] + - [461, 9919.32] - - [4096, 1024, 1, 3230] - - [326, 9853.2] + - [460, 9853.2] - - [4096, 1024, 1, 3455] - - [326, 9862.44] + - [460, 9862.44] - - [1024, 4096, 1, 3925] - - [319, 9945.64] + - [453, 9945.64] - - [1024, 4096, 1, 3362] - - [320, 9921.63] + - [454, 9921.63] - - [4096, 1024, 1, 3969] - - [327, 9911.98] + - [461, 9911.98] - - [4096, 1024, 1, 3527] - - [326, 9882.87] + - [460, 9882.87] - - [1024, 4096, 1, 3585] - - [320, 9946.52] + - [454, 9946.52] - - [4096, 1024, 1, 3063] - - [326, 9854.03] + - [460, 9854.03] - - [4096, 1024, 1, 3435] - - [326, 9867.13] + - [460, 9867.13] - - [4096, 1024, 1, 3366] - - [327, 9864.02] + - [461, 9864.02] - - [4096, 1024, 1, 3581] - - [319, 9868.57] + - [453, 9868.57] - - [1024, 33708, 1, 3906] - - [319, 10339.3] + - [453, 10339.3] - - [1024, 4096, 1, 3464] - - [327, 9916.21] + - [461, 9916.21] - - [1024, 4096, 1, 3440] - - [326, 9945.25] + - [460, 9945.25] - - [4096, 1024, 1, 3143] - - [326, 9846.76] + - [460, 9846.76] - - [1024, 4096, 1, 3349] - - [320, 9912.83] + - [454, 9912.83] - - [4096, 1024, 1, 3416] - - [326, 9885.13] + - [460, 9885.13] - - [4096, 1024, 1, 3365] - - [326, 9876.0] + - [460, 9876.0] - - [1024, 4096, 1, 3470] - - [327, 9914.98] + - [461, 9914.98] - - [4096, 1024, 1, 3287] - - [326, 9860.69] + - [460, 9860.69] - - [1024, 4096, 1, 3441] - - [327, 9928.98] + - [461, 9928.98] - - [4096, 1024, 1, 3224] - - [326, 9857.83] + - [460, 9857.83] - - [1024, 4096, 1, 3387] - - [319, 9911.72] + - [453, 9911.72] - - [1024, 4096, 1, 3547] - - [319, 9920.36] + - [453, 9920.36] - - [4096, 1024, 1, 3478] - - [320, 9882.9] + - [454, 9882.9] - - [4096, 1024, 1, 3548] - - [327, 9869.45] + - [461, 9869.45] - - [1024, 33708, 1, 4020] - - [319, 10345.3] + - [453, 10345.3] - - [4096, 1024, 1, 3320] - - [326, 9863.74] + - [460, 9863.74] - - [1024, 4096, 1, 3906] - - [326, 9942.67] + - [460, 9942.67] - - [4096, 1024, 1, 3796] - - [326, 9899.13] + - [460, 9899.13] - - [1024, 4096, 1, 3306] - - [319, 9902.4] + - [453, 9902.4] - - [1024, 4096, 1, 3401] - - [327, 9913.95] + - [461, 9913.95] - - [64, 147, 432, 147] - - [310, 6626.6] + - [444, 6626.6] - - [1024, 4096, 1, 3215] - - [327, 9911.24] + - [461, 9911.24] - - [4096, 1024, 1, 4012] - - [327, 9898.2] + - [461, 9898.2] - - [1024, 4096, 1, 2765] - - [327, 9863.73] + - [461, 9863.73] - - [4096, 1024, 1, 3554] - - [320, 9883.52] + - [454, 9883.52] - - [4096, 1024, 1, 3423] - - [326, 9866.72] + - [460, 9866.72] - - [1024, 1024, 1, 3751] - - [324, 9006.36] + - [458, 9006.36] - - [1024, 4096, 1, 3562] - - [320, 9922.08] + - [454, 9922.08] - - [1024, 4096, 1, 3489] - - [319, 9936.78] + - [453, 9936.78] - - [4096, 1024, 1, 3358] - - [326, 9858.22] + - [460, 9858.22] - - [4096, 1024, 1, 3270] - - [327, 9850.84] + - [461, 9850.84] - - [1024, 4096, 1, 3293] - - [319, 9905.33] + - [453, 9905.33] - - [1024, 4096, 1, 3376] - - [319, 9934.98] + - [453, 9934.98] - - [4096, 1024, 1, 3245] - - [326, 9852.52] + - [460, 9852.52] - - [4096, 1024, 1, 3541] - - [326, 9887.22] + - [460, 9887.22] - - [4096, 1024, 1, 3443] - - [326, 9871.73] + - [460, 9871.73] - - [4096, 1024, 1, 3438] - - [327, 9863.86] + - [461, 9863.86] - - [4096, 1024, 1, 3244] - - [326, 9859.76] + - [460, 9859.76] - - [1024, 4096, 1, 3365] - - [326, 9922.1] + - [460, 9922.1] - - [1024, 4096, 1, 3299] - - [320, 9923.38] + - [454, 9923.38] - - [4096, 1024, 1, 3840] - - [326, 9914.75] + - [460, 9914.75] - - [1024, 4096, 1, 3471] - - [327, 9918.38] + - [461, 9918.38] - - [1024, 4096, 1, 3398] - - [319, 9918.99] + - [453, 9918.99] - - [4096, 1024, 1, 3162] - - [326, 9843.93] + - [460, 9843.93] - - [1024, 4096, 1, 4005] - - [320, 9947.87] + - [454, 9947.87] - - [4096, 1024, 1, 3579] - - [326, 9868.25] + - [460, 9868.25] - - [64, 18, 3440, 18] - - [302, 2059.33] + - [436, 2059.33] - - [64, 177, 352, 177] - - [321, 7315.4] + - [455, 7315.4] - - [1024, 4096, 1, 3121] - - [327, 9930.34] + - [461, 9930.34] - - [4096, 1024, 1, 3441] - - [326, 9883.28] + - [460, 9883.28] - - [4096, 1024, 1, 3422] - - [326, 9858.41] + - [460, 9858.41] - - [4096, 1024, 1, 3444] - - [326, 9887.03] + - [460, 9887.03] - - [1024, 4096, 1, 3337] - - [320, 9911.45] + - [454, 9911.45] - - [4096, 1024, 1, 3550] - - [319, 9871.87] + - [453, 9871.87] - - [1024, 4096, 1, 3477] - - [319, 9930.65] + - [453, 9930.65] - - [4096, 1024, 1, 3490] - - [326, 9878.45] + - [460, 9878.45] - - [4096, 1024, 1, 3585] - - [326, 9893.63] + - [460, 9893.63] - - [1024, 4096, 1, 3143] - - [319, 9901.19] + - [453, 9901.19] - - [1024, 33708, 1, 3876] - - [320, 10330.8] + - [454, 10330.8] - - [1024, 4096, 1, 3320] - - [327, 9913.18] + - [461, 9913.18] - - [1024, 4096, 1, 3423] - - [327, 9914.14] + - [461, 9914.14] - - [1024, 4096, 1, 3894] - - [319, 9944.47] + - [453, 9944.47] - - [4096, 1024, 1, 3410] - - [326, 9878.67] + - [460, 9878.67] - - [1024, 4096, 1, 3561] - - [319, 9926.68] + - [453, 9926.68] - - [4096, 1024, 1, 3492] - - [320, 9872.92] + - [454, 9872.92] - - [64, 85, 752, 85] - - [297, 5734.35] + - [431, 5734.35] - - [36548, 1024, 1, 3712] - - [329, 10367.6] + - [463, 10367.6] - - [4096, 2048, 1, 128] - - [330, 8743.93] + - [464, 8743.93] - - [1024, 1024, 1, 3712] - - [331, 9976.29] + - [465, 9976.29] - - [1024, 1024, 1, 128] - - [328, 5765.47] + - [462, 5765.47] - - [4096, 3072, 1, 128] - - [330, 8869.11] + - [464, 8869.11] + - - [768, 3072, 1, 4096] + - [476, 10028.8] + - - [64, 256, 192, 256] + - [470, 8791.65] + - - [768, 2, 1, 16] + - [473, 5.05484] + - - [768, 768, 1, 64] + - [469, 3469.65] + - - [768, 768, 1, 4096] + - [477, 7475.1] + - - [768, 30522, 1, 1280] + - [480, 10297.0] + - - [64, 128, 384, 128] + - [470, 7660.93] + - - [768, 30522, 1, 320] + - [478, 10008.0] + - - [768, 768, 1, 32] + - [467, 2359.4] + - - [3072, 768, 1, 4096] + - [476, 10033.8] + - - [768, 30522, 1, 640] + - [479, 10206.8] + - - [64, 64, 768, 64] + - [468, 5494.82] + - - [768, 768, 1, 640] + - [477, 6721.74] + - - [768, 768, 1, 16] + - [466, 1203.82] + - - [768, 768, 1, 1280] + - [475, 7138.67] + - - [768, 2, 1, 32] + - [471, 11.9154] + - - [2048, 2048, 1, 512] + - [491, 9607.67] + - - [512, 32, 1, 200] + - [484, 422.368] + - - [1024, 1, 1, 200] + - [487, 24.7154] + - - [1600, 1024, 1, 512] + - [482, 8116.01] + - - [560, 1024, 1, 200] + - [481, 4810.84] + - - [1024, 1024, 1, 512] + - [490, 8614.84] + - - [2048, 1, 1, 512] + - [485, 81.0086] + - - [512, 512, 1, 200] + - [483, 4398.49] + - - [100, 2048, 1, 512] + - [488, 4443.22] + - - [1024, 1024, 1, 200] + - [489, 6990.61] + - - [1024, 64, 1, 512] + - [486, 2853.37] + - - [1024, 256, 1, 18944] + - [510, 9196.51] + - - [256, 3328, 1, 8976] + - [500, 8299.36] + - - [1024, 256, 1, 4352] + - [508, 8813.84] + - - [256, 9728, 1, 8976] + - [503, 9638.58] + - - [1024, 256, 1, 3072] + - [510, 8640.73] + - - [768, 2048, 1, 256] + - [502, 8663.03] + - - [1024, 256, 1, 19968] + - [507, 9220.96] + - - [256, 12800, 1, 8976] + - [497, 9418.52] + - - [1024, 256, 1, 3328] + - [511, 8682.58] + - - [256, 10240, 1, 8976] + - [504, 10137.8] + - - [1024, 256, 1, 15104] + - [509, 9167.13] + - - [256, 10496, 1, 8976] + - [497, 9858.48] + - - [1024, 256, 1, 2816] + - [512, 8575.81] + - - [1024, 256, 1, 4608] + - [507, 8861.31] + - - [256, 11264, 1, 8976] + - [494, 9627.79] + - - [1024, 256, 1, 6400] + - [507, 8985.33] + - - [1024, 256, 1, 16128] + - [507, 9170.36] + - - [256, 44505, 1, 8976] + - [501, 10331.9] + - - [256, 6144, 1, 8976] + - [504, 10395.1] + - - [1024, 256, 1, 5120] + - [509, 8881.63] + - - [1024, 256, 1, 7936] + - [512, 9023.24] + - - [256, 3840, 1, 8976] + - [499, 9541.38] + - - [1024, 256, 1, 21248] + - [507, 9209.82] + - - [1024, 256, 1, 12032] + - [509, 9156.27] + - - [256, 8192, 1, 8976] + - [506, 10374.5] + - - [1024, 256, 1, 3584] + - [508, 8712.3] + - - [1024, 256, 1, 14336] + - [509, 9162.61] + - - [256, 7168, 1, 8976] + - [495, 9554.96] + - - [1024, 256, 1, 13568] + - [507, 9165.14] + - - [256, 4096, 1, 8976] + - [499, 10146.7] + - - [1024, 256, 1, 4096] + - [508, 8783.98] + - - [256, 2560, 1, 8976] + - [498, 8381.66] + - - [256, 20992, 1, 8976] + - [497, 9989.96] + - - [256, 4352, 1, 8976] + - [498, 9635.02] + - - [256, 33536, 1, 8976] + - [497, 10218.2] + - - [256, 3584, 1, 8976] + - [499, 8924.6] + - - [256, 26112, 1, 8976] + - [498, 10272.4] + - - [256, 14336, 1, 8976] + - [502, 10217.4] + - - [1024, 256, 1, 14848] + - [509, 9185.29] + - - [1024, 256, 1, 8448] + - [510, 9025.99] + - - [1024, 256, 1, 28672] + - [507, 9256.5] + - - [1024, 256, 1, 5632] + - [507, 8932.79] + - - [256, 22016, 1, 8976] + - [502, 10152.0] + - - [1024, 256, 1, 33536] + - [507, 9243.17] + - - [256, 5120, 1, 8976] + - [493, 9418.15] + - - [256, 11520, 1, 8976] + - [500, 9701.1] + - - [256, 19968, 1, 8976] + - [498, 10228.1] + - - [1024, 256, 1, 5376] + - [509, 8892.62] + - - [1024, 256, 1, 22016] + - [507, 9244.34] + - - [256, 8960, 1, 8976] + - [498, 9841.41] + - - [1024, 256, 1, 15872] + - [507, 9223.25] + - - [256, 17408, 1, 8976] + - [502, 9785.87] + - - [256, 5632, 1, 8976] + - [502, 9564.32] + - - [256, 32512, 1, 8976] + - [501, 10358.0] + - - [256, 11008, 1, 8976] + - [494, 9445.23] + - - [1024, 256, 1, 6144] + - [509, 8955.91] + - - [256, 4864, 1, 8976] + - [494, 8979.45] + - - [256, 15104, 1, 8976] + - [497, 10007.1] + - - [1024, 256, 1, 9984] + - [507, 9110.53] + - - [256, 1280, 1, 8976] + - [493, 5944.44] + - - [1024, 256, 1, 1024] + - [509, 7005.2] + - - [1024, 256, 1, 9728] + - [509, 9066.29] + - - [1024, 256, 1, 10496] + - [507, 9118.15] + - - [256, 11776, 1, 8976] + - [504, 9911.74] + - - [256, 12544, 1, 8976] + - [497, 9235.35] + - - [1024, 256, 1, 17152] + - [507, 9152.31] + - - [1024, 256, 1, 11520] + - [509, 9146.87] + - - [1024, 256, 1, 21504] + - [509, 9207.52] + - - [256, 17152, 1, 8976] + - [496, 9654.81] + - - [1024, 256, 1, 17408] + - [507, 9181.27] + - - [256, 15872, 1, 8976] + - [505, 10086.5] + - - [256, 18688, 1, 8976] + - [498, 9612.57] + - - [256, 5888, 1, 8976] + - [502, 9988.43] + - - [512, 2048, 1, 256] + - [492, 7678.46] + - - [1024, 256, 1, 7680] + - [510, 9033.06] + - - [1024, 256, 1, 1280] + - [512, 7767.33] + - - [256, 14848, 1, 8976] + - [498, 9852.76] + - - [256, 9984, 1, 8976] + - [504, 9908.97] + - - [256, 20480, 1, 8976] + - [502, 10337.2] + - - [1024, 256, 1, 8192] + - [509, 9044.42] + - - [1024, 256, 1, 19712] + - [508, 9184.28] + - - [256, 13568, 1, 8976] + - [498, 9927.92] + - - [256, 13312, 1, 8976] + - [497, 9758.01] + - - [256, 2816, 1, 8976] + - [497, 9191.53] + - - [1024, 256, 1, 2304] + - [508, 8445.01] + - - [256, 21248, 1, 8976] + - [498, 10127.6] + - - [256, 16128, 1, 8976] + - [506, 10238.5] + - - [256, 512, 36, 98] + - [529, 7994.95] + - - [64, 192, 36, 25088] + - [598, 8613.99] + - - [128, 128, 64, 25] + - [528, 2540.25] + - - [256, 256, 64, 56] + - [529, 6924.66] + - - [512, 486, 36, 800] + - [536, 8994.94] + - - [512, 512, 36, 1568] + - [547, 9872.48] + - - [64, 192, 64, 3200] + - [592, 9295.99] + - - [256, 384, 36, 4096] + - [592, 9334.71] + - - [128, 256, 64, 32] + - [531, 4280.0] + - - [64, 128, 64, 23104] + - [598, 10103.2] + - - [128, 256, 64, 9] + - [522, 1709.73] + - - [256, 512, 36, 784] + - [532, 9520.83] + - - [256, 324, 36, 32] + - [570, 4473.48] + - - [512, 512, 36, 33] + - [541, 5925.27] + - - [16, 32, 36, 5760] + - [545, 1448.9] + - - [192, 384, 64, 128] + - [592, 8618.53] + - - [512, 512, 64, 72] + - [548, 8260.22] + - - [128, 128, 64, 1600] + - [521, 9008.48] + - - [512, 512, 36, 128] + - [592, 8871.72] + - - [192, 384, 64, 2304] + - [521, 9657.26] + - - [384, 256, 64, 450] + - [557, 9539.03] + - - [3, 64, 36, 6272] + - [545, 509.884] + - - [3, 64, 64, 2888] + - [574, 708.721] + - - [384, 256, 64, 2304] + - [557, 10287.6] + - - [512, 512, 64, 144] + - [592, 9226.8] + - - [256, 256, 36, 6272] + - [532, 9607.38] + - - [80, 192, 64, 4608] + - [593, 7348.03] + - - [64, 64, 36, 3136] + - [580, 5959.15] + - - [256, 384, 64, 2304] + - [557, 10283.5] + - - [512, 512, 36, 66] + - [541, 7618.18] + - - [128, 256, 64, 800] + - [567, 9611.25] + - - [64, 128, 36, 30] + - [523, 1242.71] + - - [192, 256, 36, 512] + - [592, 8658.07] + - - [256, 512, 64, 200] + - [592, 9153.97] + - - [256, 512, 64, 25] + - [570, 5349.98] + - - [3, 64, 64, 46208] + - [573, 808.662] + - - [128, 256, 36, 1568] + - [565, 8528.72] + - - [64, 128, 64, 11552] + - [598, 9997.1] + - - [128, 192, 64, 946] + - [592, 9198.48] + - - [64, 192, 64, 12800] + - [553, 9000.76] + - - [224, 224, 64, 128] + - [530, 6312.17] + - - [128, 256, 64, 288] + - [592, 8697.97] + - - [64, 64, 64, 826] + - [535, 6650.31] + - - [256, 384, 64, 1152] + - [567, 10106.9] + - - [3, 64, 64, 92416] + - [573, 812.131] + - - [32, 32, 36, 43808] + - [514, 2813.19] + - - [160, 320, 64, 288] + - [524, 8090.96] + - - [1, 16, 36, 23040] + - [561, 42.7667] + - - [128, 256, 36, 128] + - [539, 6049.58] + - - [128, 128, 64, 3360] + - [592, 9200.06] + - - [128, 128, 64, 420] + - [592, 8131.6] + - - [64, 128, 64, 361] + - [529, 6938.08] + - - [512, 512, 36, 16] + - [585, 3797.76] + - - [384, 256, 36, 800] + - [526, 9151.75] + - - [192, 384, 36, 4096] + - [526, 8867.67] + - - [64, 64, 64, 1600] + - [578, 7931.84] + - - [256, 384, 64, 576] + - [558, 9745.9] + - - [512, 512, 64, 14] + - [541, 3638.28] + - - [512, 512, 36, 8] + - [516, 2279.61] + - - [512, 486, 64, 128] + - [532, 8337.93] + - - [1, 16, 64, 640] + - [566, 50.0512] + - - [64, 96, 64, 288] + - [591, 5708.07] + - - [96, 96, 36, 1568] + - [560, 6866.85] + - - [256, 256, 36, 128] + - [564, 7703.92] + - - [64, 128, 36, 53824] + - [552, 6331.41] + - - [256, 256, 36, 32] + - [548, 4648.96] + - - [192, 256, 64, 288] + - [592, 8987.89] + - - [256, 256, 36, 16] + - [562, 2912.81] + - - [128, 256, 36, 3200] + - [565, 8680.37] + - - [160, 320, 64, 512] + - [524, 8449.54] + - - [128, 160, 36, 512] + - [535, 7215.07] + - - [96, 96, 36, 2592] + - [530, 7104.89] + - - [64, 96, 64, 800] + - [560, 7268.42] + - - [147, 64, 36, 18816] + - [576, 7116.36] + - - [160, 320, 36, 512] + - [530, 7874.92] + - - [256, 512, 36, 4] + - [569, 1034.88] + - - [96, 128, 64, 946] + - [552, 7901.17] + - - [256, 324, 64, 1568] + - [557, 8589.63] + - - [128, 128, 64, 50] + - [548, 4070.66] + - - [35, 96, 36, 8960] + - [542, 4207.4] + - - [32, 64, 36, 43808] + - [583, 4390.91] + - - [160, 224, 36, 128] + - [530, 5447.02] + - - [64, 64, 64, 81] + - [555, 2391.28] + - - [256, 256, 36, 3200] + - [521, 9559.65] + - - [256, 256, 36, 210] + - [532, 8414.71] + - - [192, 384, 64, 576] + - [592, 9468.85] + - - [512, 512, 64, 800] + - [567, 10096.5] + - - [512, 24, 36, 800] + - [518, 4761.87] + - - [64, 64, 64, 13216] + - [579, 8491.51] + - - [192, 224, 64, 1152] + - [535, 8769.16] + - - [256, 256, 64, 1152] + - [557, 9988.19] + - - [512, 486, 64, 512] + - [567, 9254.77] + - - [128, 128, 36, 784] + - [530, 7468.16] + - - [256, 512, 64, 1600] + - [554, 10232.6] + - - [512, 512, 64, 9] + - [548, 2599.88] + - - [96, 128, 64, 288] + - [560, 6599.53] + - - [64, 96, 36, 512] + - [560, 5073.85] + - - [256, 512, 36, 1568] + - [592, 9637.91] + - - [128, 128, 64, 400] + - [592, 8192.1] + - - [128, 128, 64, 800] + - [592, 8716.44] + - - [96, 128, 36, 512] + - [580, 6757.03] + - - [16, 32, 36, 360] + - [543, 754.136] + - - [128, 256, 64, 3200] + - [557, 10222.6] + - - [96, 128, 64, 800] + - [560, 7968.0] + - - [256, 512, 64, 4] + - [522, 1098.09] + - - [256, 256, 64, 450] + - [567, 9347.55] + - - [64, 64, 64, 3200] + - [578, 8518.18] + - - [192, 224, 64, 128] + - [538, 7035.27] + - - [128, 128, 64, 288] + - [592, 7751.38] + - - [256, 256, 64, 72] + - [548, 7489.93] + - - [96, 208, 36, 512] + - [560, 6939.21] + - - [128, 256, 36, 3136] + - [535, 8669.43] + - - [64, 64, 36, 3520] + - [530, 6007.57] + - - [64, 128, 36, 1568] + - [593, 6897.8] + - - [160, 320, 64, 242] + - [519, 7873.27] + - - [192, 192, 36, 512] + - [530, 7707.42] + - - [512, 512, 36, 512] + - [592, 9582.52] + - - [1, 16, 64, 10240] + - [544, 71.4511] + - - [128, 128, 36, 512] + - [530, 7149.48] + - - [512, 512, 36, 256] + - [521, 9384.5] + - - [512, 512, 36, 1024] + - [515, 9777.99] + - - [96, 208, 64, 1152] + - [593, 7851.0] + - - [128, 192, 64, 3200] + - [521, 9490.92] + - - [256, 256, 36, 4096] + - [526, 9585.56] + - - [160, 160, 64, 288] + - [560, 7299.9] + - - [256, 256, 64, 896] + - [557, 9850.43] + - - [128, 256, 64, 242] + - [592, 8391.48] + - - [128, 128, 36, 440] + - [535, 6274.82] + - - [96, 128, 36, 1568] + - [580, 7875.13] + - - [192, 384, 36, 1024] + - [526, 8715.82] + - - [64, 96, 36, 10368] + - [597, 7478.69] + - - [128, 256, 64, 100] + - [541, 7085.07] + - - [112, 224, 36, 2048] + - [534, 7556.02] + - - [384, 256, 64, 1152] + - [557, 10102.4] + - - [192, 384, 36, 128] + - [592, 7543.14] + - - [128, 128, 36, 7040] + - [565, 7600.7] + - - [128, 256, 64, 1568] + - [557, 10006.0] + - - [128, 128, 36, 1568] + - [549, 7848.4] + - - [128, 256, 64, 72] + - [572, 6553.7] + - - [256, 256, 36, 12544] + - [586, 9365.14] + - - [256, 256, 36, 105] + - [548, 7286.16] + - - [128, 256, 36, 392] + - [535, 7625.79] + - - [64, 64, 64, 5408] + - [578, 8882.77] + - - [3, 64, 36, 25088] + - [545, 529.042] + - - [384, 256, 36, 1024] + - [592, 9182.85] + - - [35, 96, 36, 13440] + - [599, 4110.39] + - - [128, 256, 64, 1152] + - [557, 9804.97] + - - [256, 324, 64, 32] + - [570, 5043.73] + - - [160, 224, 64, 128] + - [584, 6046.25] + - - [192, 224, 36, 2592] + - [582, 8878.78] + - - [96, 96, 64, 1152] + - [560, 8035.55] + - - [32, 64, 36, 90] + - [517, 964.565] + - - [64, 128, 64, 2888] + - [532, 9047.33] + - - [256, 384, 36, 800] + - [592, 9154.12] + - - [512, 512, 64, 4] + - [589, 1233.72] + - - [192, 320, 36, 128] + - [529, 7388.29] + - - [64, 128, 36, 480] + - [593, 5653.37] + - - [192, 384, 64, 242] + - [592, 9080.09] + - - [256, 486, 64, 32] + - [585, 5909.28] + - - [147, 64, 64, 9702] + - [594, 7319.79] + - - [512, 512, 64, 64] + - [528, 8179.12] + - - [64, 192, 64, 3698] + - [521, 9287.99] + - - [73, 192, 64, 10439] + - [552, 6668.12] + - - [1, 16, 36, 1440] + - [568, 33.5452] + - - [128, 256, 36, 512] + - [535, 7989.25] + - - [512, 512, 64, 576] + - [567, 9951.99] + - - [64, 64, 36, 12544] + - [583, 5872.87] + - - [128, 128, 36, 880] + - [580, 7597.36] + - - [192, 224, 36, 128] + - [538, 6451.3] + - - [64, 64, 64, 800] + - [578, 6916.83] + - - [64, 128, 36, 12544] + - [556, 6395.98] + - - [64, 64, 36, 1568] + - [530, 5536.76] + - - [160, 160, 36, 512] + - [530, 7345.36] + - - [512, 24, 64, 512] + - [520, 5242.98] + - - [3, 64, 36, 3136] + - [545, 475.452] + - - [256, 256, 64, 9] + - [570, 2106.61] + - - [3, 64, 64, 11552] + - [573, 785.227] + - - [128, 256, 36, 12544] + - [588, 8792.23] + - - [128, 128, 36, 3136] + - [549, 8098.56] + - - [256, 512, 36, 3136] + - [532, 9694.49] + - - [64, 64, 36, 196] + - [546, 2757.86] + - - [144, 288, 36, 512] + - [580, 7077.99] + - - [256, 24, 64, 32] + - [559, 1483.93] + - - [384, 384, 36, 800] + - [521, 9246.6] + - - [512, 512, 64, 1600] + - [567, 10277.4] + - - [112, 224, 36, 512] + - [535, 6744.88] + - - [128, 128, 36, 49] + - [541, 2716.39] + - - [512, 512, 36, 4] + - [569, 1156.62] + - - [35, 96, 64, 4235] + - [530, 4631.38] + - - [192, 384, 64, 450] + - [521, 9372.3] + - - [256, 256, 36, 1024] + - [592, 9346.74] + - - [112, 224, 64, 1152] + - [535, 7524.05] + - - [256, 512, 64, 400] + - [554, 9598.05] + - - [149, 32, 36, 19072] + - [599, 5811.9] + - - [128, 256, 36, 6272] + - [535, 8754.78] + - - [128, 192, 36, 1568] + - [560, 8195.2] + - - [256, 256, 36, 512] + - [592, 9074.32] + - - [256, 256, 64, 112] + - [592, 8305.65] + - - [512, 512, 64, 18] + - [585, 4324.12] + - - [256, 256, 64, 18] + - [548, 3547.91] + - - [256, 256, 64, 1568] + - [557, 10141.8] + - - [64, 96, 36, 1568] + - [578, 6805.76] + - - [384, 256, 36, 4096] + - [592, 9311.2] + - - [256, 512, 64, 800] + - [567, 9998.45] + - - [256, 384, 36, 2048] + - [592, 9285.44] + - - [3, 64, 36, 200704] + - [574, 547.475] + - - [384, 384, 64, 2304] + - [515, 9901.78] + - - [160, 320, 64, 128] + - [551, 7113.91] + - - [512, 512, 36, 528] + - [521, 9567.75] + - - [160, 320, 36, 128] + - [552, 6411.23] + - - [96, 96, 64, 800] + - [560, 7690.11] + - - [256, 512, 36, 49] + - [548, 6721.35] + - - [384, 384, 64, 450] + - [521, 9523.63] + - - [3, 64, 64, 23104] + - [573, 801.721] + - - [256, 256, 64, 3200] + - [557, 10300.5] + - - [128, 192, 36, 512] + - [535, 7499.85] + - - [192, 192, 64, 288] + - [592, 8774.34] + - - [96, 208, 64, 242] + - [552, 5902.09] + - - [256, 16, 36, 3200] + - [581, 3807.87] + - - [512, 512, 64, 8] + - [559, 2379.85] + - - [64, 128, 64, 5776] + - [532, 9332.84] + - - [512, 512, 64, 288] + - [521, 9522.09] + - - [256, 16, 36, 32] + - [577, 766.105] + - - [128, 192, 64, 288] + - [592, 8527.68] + - - [32, 64, 64, 640] + - [560, 4660.44] + - - [64, 64, 36, 392] + - [560, 3686.5] + - - [384, 384, 36, 1024] + - [526, 9282.58] + - - [64, 64, 36, 11552] + - [590, 5904.88] + - - [96, 128, 36, 6272] + - [580, 8351.09] + - - [128, 256, 36, 16] + - [562, 2144.91] + - - [256, 256, 64, 288] + - [592, 9140.23] + - - [64, 64, 64, 1652] + - [578, 7766.63] + - - [256, 384, 36, 1024] + - [526, 9203.37] + - - [96, 128, 64, 3200] + - [595, 8866.3] + - - [256, 324, 36, 3200] + - [534, 8194.35] + - - [128, 192, 64, 800] + - [592, 9198.13] + - - [64, 128, 64, 10] + - [533, 851.217] + - - [96, 208, 64, 288] + - [560, 6667.68] + - - [64, 96, 36, 2592] + - [542, 7216.98] + - - [64, 128, 64, 160] + - [571, 5191.07] + - - [192, 384, 64, 512] + - [521, 9446.14] + - - [64, 64, 36, 6272] + - [530, 6212.11] + - - [512, 24, 36, 288] + - [527, 3922.57] + - - [128, 128, 64, 1568] + - [521, 9037.96] + - - [112, 224, 64, 242] + - [591, 6399.36] + - - [128, 256, 64, 1600] + - [557, 10010.4] + - - [32, 32, 64, 20000] + - [525, 4378.51] + - - [160, 192, 64, 288] + - [552, 7803.73] + - - [512, 24, 64, 128] + - [513, 3733.9] + - - [512, 512, 36, 32] + - [548, 5935.44] + - - [3, 64, 36, 100352] + - [545, 542.883] + - - [3, 64, 64, 1444] + - [574, 674.259] + - - [512, 512, 36, 3136] + - [515, 9921.2] + - - [128, 256, 64, 6400] + - [575, 10349.4] + - - [256, 256, 36, 2048] + - [592, 9519.09] + - - [128, 160, 64, 288] + - [535, 7549.85] + - - [256, 256, 64, 6400] + - [557, 10392.7] + - - [32, 64, 64, 20000] + - [583, 6493.96] + - - [256, 256, 36, 1680] + - [532, 9513.39] + - - [128, 128, 64, 210] + - [592, 7094.2] + - - [192, 384, 36, 2048] + - [521, 8818.75] + - - [256, 256, 64, 144] + - [592, 8608.71] + - - [384, 384, 36, 4096] + - [526, 9357.04] + - - [160, 320, 64, 1152] + - [552, 8749.58] + - - [384, 256, 36, 2048] + - [592, 9279.73] + - - [256, 512, 36, 392] + - [592, 9252.24] + - - [256, 512, 64, 50] + - [548, 7511.39] + - - [73, 192, 36, 23360] + - [596, 5803.03] + - - [3, 64, 36, 50176] + - [545, 542.137] + - - [384, 384, 36, 2048] + - [521, 9325.9] + - - [256, 384, 64, 450] + - [567, 9528.76] + - - [192, 320, 64, 128] + - [526, 8399.91] + - - [128, 256, 36, 32] + - [541, 3276.9] + - - [160, 192, 36, 512] + - [580, 7752.44] + - - [512, 512, 64, 256] + - [532, 9473.74] + - - [256, 512, 64, 32] + - [570, 6391.42] + - - [384, 384, 64, 576] + - [521, 9614.89] + - - [64, 64, 64, 648] + - [578, 6282.25] + - - [512, 486, 36, 288] + - [592, 8625.03] + - - [32, 64, 36, 1440] + - [530, 3961.6] + - - [144, 288, 64, 242] + - [552, 6347.12] + - - [384, 256, 64, 576] + - [557, 9775.34] + - - [512, 512, 36, 64] + - [528, 7791.38] + - - [448, 384, 64, 128] + - [521, 9132.33] + - - [64, 128, 64, 722] + - [571, 8047.21] + - - [144, 288, 64, 288] + - [580, 6859.5] + - - [512, 512, 64, 224] + - [592, 9427.39] + - - [112, 224, 64, 288] + - [591, 6737.02] + - - [384, 384, 64, 1152] + - [515, 9820.56] + - - [448, 384, 36, 128] + - [592, 8761.41] + - - [64, 64, 64, 100] + - [538, 2708.2] + - - [256, 486, 36, 128] + - [564, 7640.14] + - - [64, 96, 64, 4608] + - [593, 8351.59] + - - [16, 32, 64, 160] + - [517, 736.46] + - - [64, 192, 36, 6272] + - [593, 8041.29] + - - [64, 64, 64, 200] + - [546, 3924.41] + - - [256, 256, 36, 800] + - [592, 9299.65] + - - [64, 128, 36, 6272] + - [590, 6816.46] + - - [32, 64, 64, 40] + - [537, 885.722] + - - [256, 16, 64, 32] + - [587, 1205.36] + - - [192, 384, 36, 800] + - [526, 8673.98] + - - [128, 128, 36, 3200] + - [560, 8538.99] + - - [256, 256, 36, 256] + - [532, 8454.46] + - - [192, 384, 64, 1152] + - [521, 9589.11] + - - [128, 256, 64, 200] + - [531, 8141.22] + - - [64, 96, 64, 1152] + - [560, 7620.98] + - - [128, 128, 36, 392] + - [535, 6175.61] + - - [80, 192, 36, 10368] + - [583, 6497.26] + - - [224, 224, 36, 128] + - [593, 5826.99] + - - [512, 512, 64, 28] + - [548, 5728.91] + - - [256, 16, 64, 1568] + - [563, 4637.3] + - - [144, 288, 64, 1152] + - [580, 7784.34] + - - [256, 256, 64, 576] + - [557, 9596.22] + - - [64, 128, 36, 784] + - [593, 6059.09] + - - [256, 24, 36, 128] + - [527, 2239.94] + - - [256, 256, 64, 2304] + - [557, 10225.8] + - - [192, 384, 36, 512] + - [592, 8549.13] + - - [16, 32, 64, 2560] + - [545, 2153.23] + - - [256, 512, 36, 32] + - [570, 5702.33] + - - [512, 512, 64, 128] + - [592, 9084.21] + - - [128, 128, 64, 200] + - [529, 6972.01] + - - [512, 512, 64, 32] + - [541, 6248.6] + - - [128, 256, 36, 196] + - [541, 6628.86] + - - [8, 384, 64, 6600] + - [573, 2733.99] + - - [149, 32, 64, 8195] + - [535, 6051.01] + - - [35, 96, 64, 6160] + - [580, 4689.45] + - - [64, 64, 36, 1760] + - [530, 5622.34] - null diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml index 08dd4df6f..096950937 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -32091,8 +32091,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32255,8 +32255,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32419,8 +32419,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32583,8 +32583,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32747,8 +32747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -32911,8 +32911,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33075,8 +33075,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33239,8 +33239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33399,8 +33399,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33563,8 +33563,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33723,8 +33723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -33887,8 +33887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34051,8 +34051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34215,8 +34215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34379,8 +34379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34543,8 +34543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34707,8 +34707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -34871,8 +34871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35035,8 +35035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35199,8 +35199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35363,8 +35363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35527,8 +35527,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35691,8 +35691,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -35855,8 +35855,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36019,8 +36019,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36186,8 +36186,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36349,8 +36349,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36516,8 +36516,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36679,8 +36679,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -36846,8 +36846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37009,8 +37009,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37176,8 +37176,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37339,8 +37339,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37506,8 +37506,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37667,8 +37667,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37828,8 +37828,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -37991,8 +37991,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38158,8 +38158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38323,8 +38323,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38486,8 +38486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38653,8 +38653,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38816,8 +38816,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -38983,8 +38983,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39146,8 +39146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39309,8 +39309,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39474,8 +39474,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39637,8 +39637,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39800,8 +39800,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -39965,8 +39965,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40128,8 +40128,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40291,8 +40291,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40452,8 +40452,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40613,8 +40613,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40774,8 +40774,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -40935,8 +40935,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41100,8 +41100,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41263,8 +41263,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41430,8 +41430,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41593,8 +41593,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41756,8 +41756,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -41915,8 +41915,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42078,8 +42078,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42239,8 +42239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42404,8 +42404,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42565,8 +42565,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42726,8 +42726,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -42887,8 +42887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43052,8 +43052,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43213,8 +43213,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43374,8 +43374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43535,8 +43535,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43696,8 +43696,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -43857,8 +43857,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44018,8 +44018,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44179,8 +44179,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44340,8 +44340,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44501,8 +44501,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44662,8 +44662,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44823,8 +44823,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -44984,8 +44984,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45145,8 +45145,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45306,8 +45306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45467,8 +45467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45628,8 +45628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45787,8 +45787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -45947,8 +45947,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46107,8 +46107,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46267,8 +46267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46427,8 +46427,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46587,8 +46587,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46747,8 +46747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -46911,8 +46911,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47071,8 +47071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47231,8 +47231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47391,8 +47391,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47551,8 +47551,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47711,8 +47711,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -47871,8 +47871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48035,8 +48035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48195,8 +48195,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48359,8 +48359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48519,8 +48519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48683,8 +48683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -48843,8 +48843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49003,8 +49003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49163,8 +49163,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49323,8 +49323,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49483,8 +49483,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49647,8 +49647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49811,8 +49811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -49975,8 +49975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50135,8 +50135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50299,8 +50299,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50463,8 +50463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50623,8 +50623,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50787,8 +50787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -50951,8 +50951,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51111,8 +51111,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51275,8 +51275,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51439,8 +51439,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51603,8 +51603,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51763,8 +51763,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -51927,8 +51927,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52087,8 +52087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52251,8 +52251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52415,8 +52415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52579,8 +52579,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52739,8 +52739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -52903,8 +52903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53067,8 +53067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53231,8 +53231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53395,8 +53395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53559,8 +53559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53723,8 +53723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -53887,8 +53887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54051,8 +54051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54215,8 +54215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54375,8 +54375,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54539,8 +54539,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54703,8 +54703,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -54867,8 +54867,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55031,8 +55031,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55195,8 +55195,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55359,8 +55359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55519,8 +55519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55679,8 +55679,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55839,8 +55839,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -55999,8 +55999,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56159,8 +56159,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56319,8 +56319,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56479,8 +56479,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56639,8 +56639,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56799,8 +56799,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -56959,8 +56959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57119,8 +57119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57279,8 +57279,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57443,8 +57443,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57607,8 +57607,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57767,8 +57767,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -57931,8 +57931,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58095,8 +58095,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58259,8 +58259,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58419,8 +58419,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58583,8 +58583,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58743,8 +58743,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -58907,8 +58907,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59071,8 +59071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59231,8 +59231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59395,8 +59395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59559,8 +59559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59723,8 +59723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -59887,8 +59887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60051,8 +60051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60215,8 +60215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60379,8 +60379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60543,8 +60543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60707,8 +60707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -60871,8 +60871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61035,8 +61035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61199,8 +61199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61363,8 +61363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61527,8 +61527,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61691,8 +61691,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -61855,8 +61855,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62015,8 +62015,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62179,8 +62179,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62343,8 +62343,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62507,8 +62507,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62671,8 +62671,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62831,8 +62831,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -62991,8 +62991,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63155,8 +63155,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63319,8 +63319,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63483,8 +63483,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63647,8 +63647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63807,8 +63807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -63971,8 +63971,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64135,8 +64135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64295,8 +64295,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64459,8 +64459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64619,8 +64619,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64783,8 +64783,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -64943,8 +64943,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65103,8 +65103,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65267,8 +65267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65431,8 +65431,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65595,8 +65595,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65759,8 +65759,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -65923,8 +65923,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66087,8 +66087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66251,8 +66251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66411,8 +66411,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66575,8 +66575,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66739,8 +66739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -66903,8 +66903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67067,8 +67067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67231,8 +67231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67395,8 +67395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67559,8 +67559,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67723,8 +67723,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -67887,8 +67887,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68051,8 +68051,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68215,8 +68215,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68379,8 +68379,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68543,8 +68543,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68707,8 +68707,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -68871,8 +68871,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69035,8 +69035,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69199,8 +69199,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69359,8 +69359,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69519,8 +69519,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69683,8 +69683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -69843,8 +69843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70003,8 +70003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70167,8 +70167,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70327,8 +70327,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70491,8 +70491,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70651,8 +70651,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70811,8 +70811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -70975,8 +70975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71139,8 +71139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71303,8 +71303,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71467,8 +71467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71631,8 +71631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71795,8 +71795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -71959,8 +71959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72123,8 +72123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72287,8 +72287,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72451,8 +72451,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72611,8 +72611,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72775,8 +72775,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -72939,8 +72939,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73103,8 +73103,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73267,8 +73267,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73431,8 +73431,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73595,8 +73595,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73759,8 +73759,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -73923,8 +73923,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74087,8 +74087,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74251,8 +74251,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74415,8 +74415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74579,8 +74579,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74739,8 +74739,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -74903,8 +74903,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75067,8 +75067,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75231,8 +75231,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75395,8 +75395,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75555,8 +75555,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75715,8 +75715,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -75879,8 +75879,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76043,8 +76043,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76203,8 +76203,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76363,8 +76363,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76523,8 +76523,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76683,8 +76683,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -76843,8 +76843,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77003,8 +77003,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77163,8 +77163,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77327,8 +77327,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77487,8 +77487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77647,8 +77647,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77811,8 +77811,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -77975,8 +77975,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78139,8 +78139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78303,8 +78303,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78467,8 +78467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78631,8 +78631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78795,8 +78795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -78959,8 +78959,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79123,8 +79123,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79292,8 +79292,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79457,8 +79457,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79624,8 +79624,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79791,8 +79791,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -79958,8 +79958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80125,8 +80125,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80294,8 +80294,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80459,8 +80459,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80628,8 +80628,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80795,8 +80795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -80962,8 +80962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81129,8 +81129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81296,8 +81296,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81463,8 +81463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81630,8 +81630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81795,8 +81795,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -81962,8 +81962,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82129,8 +82129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82296,8 +82296,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82463,8 +82463,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82630,8 +82630,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82797,8 +82797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -82966,8 +82966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83133,8 +83133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83300,8 +83300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83467,8 +83467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83634,8 +83634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83801,8 +83801,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -83968,8 +83968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84135,8 +84135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84300,8 +84300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84467,8 +84467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84632,8 +84632,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84799,8 +84799,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -84964,8 +84964,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85131,8 +85131,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85300,8 +85300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85467,8 +85467,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85634,8 +85634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85801,8 +85801,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -85966,8 +85966,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86133,8 +86133,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86300,8 +86300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86469,8 +86469,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86636,8 +86636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86803,8 +86803,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -86968,8 +86968,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87135,8 +87135,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87304,8 +87304,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87471,8 +87471,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87638,8 +87638,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87805,8 +87805,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -87972,8 +87972,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -88139,8 +88139,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -88186,23 +88186,23 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -88210,37 +88210,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88248,10 +88245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88259,26 +88256,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88288,6 +88293,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88297,6 +88303,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88311,47 +88318,55 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 559 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id002 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -88359,37 +88374,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88397,10 +88409,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88408,26 +88420,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88437,6 +88457,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88446,6 +88467,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88460,33 +88482,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 560 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -88501,7 +88531,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88509,47 +88539,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88557,19 +88588,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -88577,6 +88615,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88586,6 +88625,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88595,6 +88635,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88609,14 +88650,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 561 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -88627,63 +88675,69 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2560 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -88692,9 +88746,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88702,26 +88756,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88731,6 +88793,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88740,6 +88803,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88754,48 +88818,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 562 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88803,43 +88875,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88847,26 +88924,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88876,6 +88959,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -88885,6 +88969,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88899,48 +88984,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 563 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id003 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id004 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -88948,43 +89043,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4352 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88992,26 +89092,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89021,6 +89127,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89030,6 +89137,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89044,14 +89152,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 564 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -89062,74 +89177,82 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id007 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89137,26 +89260,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89166,6 +89297,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89175,6 +89307,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89189,96 +89322,105 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 565 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 16 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89286,19 +89428,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -89306,6 +89453,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89315,6 +89463,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -89324,6 +89473,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89338,48 +89488,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 566 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89387,43 +89547,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89431,20 +89596,19198 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 - PerformanceSyncLocation: -1 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 784 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2080 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4224 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2112 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1024 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 520 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 1040 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id004 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 + SubGroup0: 32 + SubGroup1: 2 + SubGroupA: 32 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id008 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id007 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4352 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 16 + LSPA: 4 + LSPB: 12 + LVCA: 48 + LVCB: 16 + LVPA: 4 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 12, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 12 + LSCB: 16 + LSPA: 16 + LSPB: 12 + LVCA: 12 + LVCB: 16 + LVPA: 16 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id009 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 24 + LSCB: 32 + LSPA: 8 + LSPB: 6 + LVCA: 24 + LVCB: 32 + LVPA: 8 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 24 + MacroTile1: 24 + MacroTileA: 24 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 3 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: [3, 4] + ThreadTile0: 3 + ThreadTile1: 4 + ThreadTileA: 3 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 48 + LSCB: 32 + LSPA: 8 + LSPB: 12 + LVCA: 24 + LVCB: 16 + LVPA: 4 + LVPB: 6 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 24 + MacroTileA: 48 + MacroTileB: 24 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SubGroup0: 8 + SubGroup1: 6 + SubGroupA: 8 + SubGroupB: 6 + SuppresssNoLoadLoop: false + ThreadTile: *id011 + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id010 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 384 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 6 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id023 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id024 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id025 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89483,25 +108826,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 567 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id008 + ThreadTile: *id018 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id004 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -89515,15 +108858,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89531,27 +108874,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89564,11 +108911,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 8 - MacroTileA: 128 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89576,20 +108923,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89628,25 +108975,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 568 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id005 + ThreadTile: *id022 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id007 + VectorWidth: 2 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -89660,7 +109007,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89668,7 +109015,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89676,37 +109023,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -89715,9 +109062,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89725,13 +109072,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -89777,26 +109124,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 569 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 - SubGroup0: 8 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 SubGroup1: 4 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id001 - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -89809,16 +109156,165 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -89826,26 +109322,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89858,11 +109358,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89870,20 +109370,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89922,17 +109422,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 570 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id008 - ThreadTile0: 2 + ThreadTile: *id022 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -89940,8 +109440,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -89954,15 +109454,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -89971,26 +109471,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90003,11 +109507,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 4 - MacroTileA: 64 - MacroTileB: 4 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90017,18 +109521,18 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90067,25 +109571,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 571 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 - SubGroup0: 32 - SubGroup1: 2 - SubGroupA: 32 - SubGroupB: 2 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id008 + ThreadTile: *id018 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id007 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90099,60 +109603,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 8 LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4352 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 12 MacroTile0: 64 - MacroTile1: 4 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90160,20 +109668,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -90212,25 +109720,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 572 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 - SubGroup0: 16 - SubGroup1: 2 - SubGroupA: 16 - SubGroupB: 2 + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id003 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: *id019 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90244,7 +109752,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90252,39 +109760,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 - LSCB: 16 - LSPA: 4 - LSPB: 12 - LVCA: 48 - LVCB: 16 - LVPA: 4 - LVPB: 12 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90297,11 +109805,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90309,15 +109817,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90361,26 +109869,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 573 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 - SubGroup1: 12 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 12 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 12, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id019 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -90393,7 +109901,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90401,7 +109909,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -90409,48 +109917,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 12 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 12 - LVCA: 12 - LVCB: 16 - LVPA: 16 - LVPB: 12 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90458,15 +109966,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90510,25 +110018,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 574 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 - SubGroup0: 12 - SubGroup1: 16 - SubGroupA: 12 - SubGroupB: 16 + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] + VectorWidth: 2 + WorkGroup: *id026 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90542,7 +110050,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -90550,7 +110058,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -90558,48 +110066,197 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90607,14 +110264,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -90659,25 +110316,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 575 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 + SolutionIndex: 699 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id009 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: *id026 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90716,39 +110373,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 12 - LVCA: 24 + LSPB: 8 + LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 6 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90756,15 +110413,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90808,25 +110465,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 576 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 - SubGroup1: 6 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 6 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id010 + WorkGroup: *id019 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90848,39 +110505,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 24 + LSCA: 16 LSCB: 32 - LSPA: 8 - LSPB: 6 - LVCA: 24 - LVCB: 32 - LVPA: 8 - LVPB: 6 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90894,10 +110551,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 24 - MacroTile1: 24 - MacroTileA: 24 - MacroTileB: 24 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90905,15 +110562,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 3 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -90957,25 +110614,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 577 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 - SubGroup1: 6 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 6 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [3, 4] - ThreadTile0: 3 - ThreadTile1: 4 - ThreadTileA: 3 - ThreadTileB: 4 + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id010 + VectorWidth: 2 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -90997,39 +110654,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 48 + LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 12 - LVCA: 24 - LVCB: 16 - LVPA: 4 - LVPB: 6 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91043,10 +110700,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 24 - MacroTileA: 48 - MacroTileB: 24 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91054,15 +110711,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91106,85 +110763,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 578 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 - SubGroup1: 6 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 6 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id011 - ThreadTile0: 6 + ThreadTile: *id020 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id010 + VectorWidth: 4 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91192,10 +110849,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91203,7 +110860,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 1 @@ -91211,7 +110868,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91255,46 +110912,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 579 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -91304,46 +110961,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 LSPA: 8 LSPB: 16 - LVCA: 8 - LVCB: 4 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91354,13 +111011,13 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91404,85 +111061,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 580 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 + ThreadTile: *id022 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 + WorkGroup: *id024 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 8 - LVCB: 2 - LVPA: 2 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91491,9 +111148,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91501,15 +111158,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91553,46 +111210,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 581 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -91602,36 +111259,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 16 - LVCB: 2 - LVPA: 1 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91639,9 +111296,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91650,15 +111307,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91702,48 +111359,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 582 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: *id027 ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: *id026 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91751,36 +111408,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCB: 32 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91799,15 +111456,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -91851,85 +111508,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 583 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 + ThreadTile: *id022 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 32 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -91937,10 +111594,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91948,15 +111605,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92000,85 +111657,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 584 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 708 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id020 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -92086,10 +111743,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92097,15 +111754,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92149,96 +111806,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 585 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id021 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id023 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92247,14 +111904,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92298,46 +111955,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 586 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id020 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: *id024 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92347,47 +112004,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 LSPA: 8 LSPB: 32 - LVCA: 8 - LVCB: 2 + LVCA: 32 + LVCB: 8 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92395,15 +112052,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92447,46 +112104,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 587 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id027 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: *id024 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -92496,36 +112153,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -92534,9 +112191,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92544,15 +112201,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92596,46 +112253,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 588 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id012 + ThreadTile: *id018 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id026 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -92645,36 +112302,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 + LSCB: 32 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -92683,9 +112340,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92693,15 +112350,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92745,75 +112402,75 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 589 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id022 ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id026 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 2 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -92823,18 +112480,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92843,14 +112500,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -92894,48 +112551,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 590 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id020 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 4, 1] + VectorWidth: 4 + WorkGroup: *id023 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -92943,47 +112600,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92991,15 +112648,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93043,96 +112700,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 591 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id012 + ThreadTile: *id018 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: *id026 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93140,15 +112797,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93192,46 +112849,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 592 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id020 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93241,46 +112898,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 2 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -93289,15 +112946,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93341,46 +112998,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 593 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 + ThreadTile: *id021 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 1 + WorkGroup: *id023 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -93390,36 +113047,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCB: 32 + LSPA: 8 LSPB: 16 LVCA: 32 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93438,15 +113095,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93490,75 +113147,75 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 594 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id022 ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: *id024 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -93568,18 +113225,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93587,15 +113244,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93639,48 +113296,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 595 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id018 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: *id026 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93688,21 +113345,21 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3328 @@ -93717,18 +113374,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93736,15 +113393,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -93788,25 +113445,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 596 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 + ThreadTile: *id018 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 + WorkGroup: *id026 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -93820,7 +113477,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93828,39 +113485,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 16 - LVCB: 4 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 384 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -93873,11 +113526,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93885,20 +113538,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 6 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 6 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -93937,26 +113590,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 597 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -93969,7 +113622,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93977,39 +113630,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94022,11 +113671,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94034,20 +113683,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94086,96 +113735,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 598 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94183,15 +113832,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -94235,48 +113884,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 599 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94284,47 +113933,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94332,14 +113981,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -94384,48 +114033,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 600 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id031 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id028 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94433,47 +114082,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3584 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94481,20 +114126,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94533,96 +114178,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 601 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id019 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 + KernelLanguage: Source + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94630,20 +114271,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94682,35 +114323,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 602 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -94720,10 +114361,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94731,26 +114372,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -94760,18 +114401,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94779,8 +114420,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -94831,35 +114472,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 603 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 + ThreadTile: *id030 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -94869,10 +114510,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94880,47 +114521,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94928,8 +114565,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -94941,7 +114578,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -94980,35 +114617,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 604 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -95018,58 +114655,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 + KernelLanguage: Source + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95077,15 +114714,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -95129,35 +114766,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 605 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -95167,58 +114804,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 + KernelLanguage: Source + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95226,15 +114863,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -95278,35 +114915,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 606 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id031 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -95316,58 +114953,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95375,20 +115008,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95427,35 +115060,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 607 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id032 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -95465,8 +115098,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -95476,47 +115109,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 + KernelLanguage: Source + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95524,15 +115157,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -95576,96 +115209,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 608 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 + ThreadTile: *id030 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + WorkGroup: *id028 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95673,20 +115302,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95725,96 +115354,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 609 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id029 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95822,20 +115447,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -95874,96 +115499,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 610 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id031 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + KernelLanguage: Source + LSCA: 128 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95971,20 +115592,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -96023,48 +115644,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 611 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id028 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96072,47 +115693,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 16 - MacroTileA: 128 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96120,20 +115737,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -96172,48 +115789,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 612 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id025 + WorkGroup: *id028 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96221,47 +115838,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96269,14 +115886,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -96321,25 +115938,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 613 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id029 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: *id028 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -96353,7 +115970,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96362,7 +115979,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96370,47 +115987,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 128 LVCA: 32 - LVCB: 16 + LVCB: 2 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96418,13 +116035,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -96470,26 +116087,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 614 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id021 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -96502,7 +116119,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96510,45 +116127,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96556,10 +116173,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96567,15 +116184,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96619,25 +116236,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 615 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -96657,7 +116274,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -96668,7 +116285,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -96676,39 +116293,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPB: 64 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96716,20 +116329,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -96768,14 +116381,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 616 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -96786,8 +116399,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -96817,7 +116430,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -96827,16 +116440,16 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96846,18 +116459,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96867,13 +116480,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -96917,17 +116530,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 617 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -96935,8 +116548,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -96955,58 +116568,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97014,20 +116623,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -97066,26 +116675,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 618 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97106,56 +116715,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97163,14 +116772,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -97215,26 +116824,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 619 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97256,7 +116865,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97264,47 +116873,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97312,14 +116921,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -97364,26 +116973,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 620 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97402,58 +117011,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97461,8 +117066,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -97474,7 +117079,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -97513,26 +117118,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 621 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97562,7 +117167,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -97579,7 +117184,7 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -97591,14 +117196,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -97662,26 +117267,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 622 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -97702,35 +117307,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6400 + LdsNumElements: 7680 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -97740,18 +117345,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 16 + MacroTile1: 96 MacroTileA: 128 - MacroTileB: 16 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97759,14 +117364,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -97811,25 +117416,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 623 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id025 + VectorWidth: 2 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -97849,58 +117454,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97908,20 +117509,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -97960,25 +117561,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 624 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -98009,7 +117610,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -98017,39 +117618,39 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98057,15 +117658,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98109,25 +117710,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 625 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -98147,7 +117748,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -98158,7 +117759,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -98168,37 +117769,33 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 32 + LSPA: 16 + LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98208,18 +117805,18 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -98258,17 +117855,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 626 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + ThreadTile: *id035 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -98276,8 +117873,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -98298,56 +117895,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98355,8 +117952,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -98407,26 +118004,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 627 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -98445,58 +118042,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98504,20 +118097,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -98556,26 +118149,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 628 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -98597,7 +118190,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -98605,47 +118198,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98653,14 +118246,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -98705,25 +118298,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 629 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id020 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -98745,19 +118338,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -98766,35 +118359,35 @@ LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98802,15 +118395,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -98854,25 +118447,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 630 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id038 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id019 + VectorWidth: 2 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -98892,58 +118485,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98951,20 +118540,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -99003,25 +118592,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 631 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -99043,56 +118632,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99100,13 +118689,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -99152,26 +118741,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 632 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -99184,7 +118773,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99192,56 +118781,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99249,15 +118838,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -99301,26 +118890,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 633 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id039 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -99333,13 +118922,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -99350,7 +118939,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -99358,39 +118947,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 + LSCA: 128 + LSCB: 16 LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 2 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99398,20 +118983,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -99450,14 +119035,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 634 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id027 + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -99468,7 +119053,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id019 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -99482,7 +119067,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99490,56 +119075,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99547,14 +119132,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -99599,26 +119184,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 635 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -99631,64 +119216,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 128 LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99696,20 +119277,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -99748,25 +119329,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 636 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -99780,7 +119361,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99797,7 +119378,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -99806,38 +119387,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 128 LVCA: 32 - LVCB: 8 + LVCB: 2 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 24 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99845,13 +119426,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -99897,14 +119478,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 637 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -99915,7 +119496,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -99929,64 +119510,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99994,20 +119571,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100046,26 +119623,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 638 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id019 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -100078,7 +119655,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100086,56 +119663,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100143,13 +119720,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -100195,26 +119772,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 639 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -100233,7 +119810,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -100244,7 +119821,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -100252,39 +119829,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100292,20 +119865,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100344,14 +119917,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 640 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -100362,7 +119935,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -100393,7 +119966,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -100412,7 +119985,7 @@ LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -100422,18 +119995,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100441,14 +120014,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -100493,17 +120066,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 641 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -100511,7 +120084,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -100531,58 +120104,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100590,20 +120159,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100642,25 +120211,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 642 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id024 + VectorWidth: 4 + WorkGroup: *id034 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -100682,56 +120251,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100739,14 +120308,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -100791,26 +120360,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 643 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -100829,7 +120398,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -100840,7 +120409,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -100857,30 +120426,26 @@ LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100888,20 +120453,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100940,14 +120505,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 644 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id027 + ThreadTile: *id033 ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 @@ -100958,8 +120523,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -100980,56 +120545,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101037,14 +120602,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -101089,26 +120654,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 645 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -101127,7 +120692,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -101138,7 +120703,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101146,39 +120711,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101186,20 +120747,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -101238,14 +120799,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 646 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 + ThreadTile: *id035 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -101256,7 +120817,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -101287,7 +120848,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101306,7 +120867,7 @@ LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -101316,18 +120877,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101335,14 +120896,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -101387,17 +120948,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 647 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id035 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -101405,7 +120966,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id023 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -101425,7 +120986,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -101436,7 +120997,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101444,39 +121005,35 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101484,20 +121041,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -101536,17 +121093,17 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 648 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 + ThreadTile: *id037 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -101554,7 +121111,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -101585,7 +121142,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -101602,7 +121159,7 @@ LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 14336 LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 @@ -101614,14 +121171,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -101685,25 +121242,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 649 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id027 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id037 ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id024 + WorkGroup: *id034 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -101723,58 +121280,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 4096 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101782,20 +121335,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -101834,26 +121387,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 650 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -101874,56 +121427,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 4 + LVCB: 8 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101931,14 +121484,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -101983,96 +121536,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 651 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id034 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Source + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102081,14 +121634,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102132,47 +121685,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 652 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 + VectorWidth: 2 + WorkGroup: *id040 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 2 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -102180,48 +121733,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly + KernelLanguage: Source LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCB: 2 + LSPA: 2 + LSPB: 32 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 2 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102229,15 +121782,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102281,26 +121834,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 653 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id040 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102313,7 +121866,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102321,39 +121874,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102366,11 +121919,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102378,7 +121931,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -102430,26 +121983,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 654 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id020 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppresssNoLoadLoop: true + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id044 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102462,7 +122015,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102470,56 +122023,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102527,15 +122080,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102579,26 +122132,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 655 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id021 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id023 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id042 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102611,7 +122164,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102619,8 +122172,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -102631,44 +122184,44 @@ GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 8 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102676,15 +122229,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102728,26 +122281,26 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 656 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id022 - ThreadTile0: 4 + ThreadTile: *id041 + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id024 - WorkGroupMapping: 8 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -102760,7 +122313,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102768,7 +122321,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -102780,44 +122333,44 @@ GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102825,15 +122378,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -102877,25 +122430,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 657 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id018 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 + WorkGroup: *id042 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -102909,7 +122462,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102917,7 +122470,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -102930,26 +122483,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -102962,10 +122515,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 2 + MacroTile0: 8 MacroTile1: 8 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -102976,13 +122529,13 @@ NonTemporalC: 0 NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -103026,35 +122579,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 658 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 - SubGroup0: 16 + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 SubGroup1: 4 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id018 + ThreadTile: *id041 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: *id043 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103064,54 +122617,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + KernelLanguage: Assembly + LSCA: 16 LSCB: 8 LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103119,20 +122676,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103171,92 +122728,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 659 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103264,20 +122825,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103316,35 +122877,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 660 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id043 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103354,58 +122915,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 16 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 16 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103413,8 +122974,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -103465,48 +123026,48 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 661 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: *id041 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id044 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103519,27 +123080,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3360 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103550,11 +123111,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103562,14 +123123,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -103614,77 +123175,81 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 662 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id031 + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id046 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 16 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3360 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103695,11 +123260,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103707,20 +123272,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103759,35 +123324,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 663 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: *id046 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103797,8 +123362,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103815,21 +123380,25 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103841,9 +123410,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -103852,20 +123421,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103904,35 +123473,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 664 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id045 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -103942,8 +123511,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103960,7 +123529,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 LSCB: 16 LSPA: 8 @@ -103970,15 +123539,15 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104053,35 +123622,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 665 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id046 WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104092,38 +123661,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 32 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104135,10 +123704,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104147,20 +123716,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104198,35 +123767,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 666 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 790 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppresssNoLoadLoop: false - ThreadTile: *id029 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 1 + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104237,42 +123806,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 2 + LSPB: 16 + LVCA: 128 + LVCB: 16 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2592 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104284,10 +123849,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104296,20 +123861,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104347,96 +123912,92 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 667 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 791 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id029 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id048 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 16 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 256 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSPA: 1 + LSPB: 16 + LVCA: 256 + LVCB: 16 + LVPA: 1 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 4096 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104445,20 +124006,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104496,35 +124057,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 668 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 792 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: *id050 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 2 + WorkGroup: *id048 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104535,38 +124096,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104578,10 +124139,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104589,21 +124150,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104641,35 +124202,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 669 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 + SolutionIndex: 793 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id032 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ThreadTile: *id047 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id028 + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -104680,42 +124241,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 + KernelLanguage: Assembly + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104727,9 +124284,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -104738,21 +124295,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104790,46 +124347,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 670 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 794 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id030 - ThreadTile0: 8 + ThreadTile: *id047 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id049 WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104846,21 +124403,21 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104871,7 +124428,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -104885,19 +124442,19 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104935,46 +124492,46 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 671 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 795 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id029 + ThreadTile: *id047 ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -104991,21 +124548,21 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3136 LdsOffsetA: 0 LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105016,11 +124573,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105031,18 +124588,18 @@ NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -105080,75 +124637,77 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 672 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 796 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: *id031 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: *id050 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 1 + WorkGroup: *id049 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 32 + LVCA: 128 LVCB: 8 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105161,7 +124720,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -105174,9 +124733,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 @@ -105186,7 +124743,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105225,73 +124782,72 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 673 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 797 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id030 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 + VectorWidth: 1 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 1 LSPB: 32 - LVCA: 16 + LVCA: 256 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 1 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2304 LdsOffsetA: 0 LdsOffsetB: 2048 LdsPadA: 0 @@ -105302,15 +124858,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105318,14 +124874,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -105370,79 +124924,78 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 674 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + SolutionIndex: 798 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id051 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id028 + WorkGroup: *id053 WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105455,10 +125008,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -105467,14 +125020,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -105519,31 +125070,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 675 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 + SolutionIndex: 799 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id029 - ThreadTile0: 4 + ThreadTile: *id051 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id028 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id052 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -105557,37 +125108,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -105606,9 +125156,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105616,13 +125166,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -105668,31 +125216,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 676 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SolutionIndex: 800 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id034 + VectorWidth: 2 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -105706,8 +125254,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -105725,22 +125272,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 - LVCA: 32 + LSPB: 64 + LVCA: 16 LVCB: 2 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105754,10 +125301,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105765,15 +125312,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -105817,17 +125362,16 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 677 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 801 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -105835,13 +125379,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: *id055 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -105849,14 +125394,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -105875,17 +125419,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 2 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -105898,7 +125446,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -105910,20 +125458,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105962,31 +125508,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 678 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 802 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + SubGroupB: 8 + ThreadTile: *id054 ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: *id055 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106000,8 +125546,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106028,13 +125573,9 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106049,9 +125590,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106059,20 +125600,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106111,17 +125650,16 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 679 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 + SolutionIndex: 803 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: *id051 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -106129,13 +125667,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106149,8 +125688,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106177,9 +125715,13 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106194,9 +125736,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106204,20 +125746,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106256,31 +125796,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 680 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 804 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106294,8 +125834,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106322,13 +125861,9 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106355,8 +125890,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -106366,7 +125899,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106405,14 +125938,13 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 681 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 805 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 + ThreadTile: *id054 ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 @@ -106423,162 +125955,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 - WorkGroupMappingType: B - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: false - UseBeta: true - UseInitialStrides: false - SolutionIndex: 682 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106592,8 +125976,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106649,8 +126032,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -106699,14 +126080,13 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 683 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 806 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -106717,13 +126097,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106737,8 +126118,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106798,8 +126178,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -106848,14 +126226,13 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 684 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + SolutionIndex: 807 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 + ThreadTile: *id051 ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -106866,13 +126243,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: *id052 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -106886,37 +126264,36 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -106935,9 +126312,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106945,14 +126322,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -106997,31 +126372,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 685 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SolutionIndex: 808 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 + VectorWidth: 4 + WorkGroup: *id053 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -107035,8 +126410,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107063,9 +126437,13 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107080,9 +126458,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107090,20 +126468,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107142,17 +126518,16 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 686 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 + SolutionIndex: 809 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id054 + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -107160,30 +126535,32 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: *id053 + WorkGroupMapping: 8 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -107196,25 +126573,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107227,11 +126604,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107239,15 +126616,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107291,33 +126671,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 687 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 810 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107329,8 +126719,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107345,21 +126735,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107373,10 +126767,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107384,20 +126778,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107436,48 +126833,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 688 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 811 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -107490,21 +126897,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -107521,10 +126928,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -107533,15 +126940,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107585,17 +126995,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 689 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 812 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -107603,15 +127020,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107623,8 +127043,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107639,21 +127059,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107667,9 +127091,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -107678,20 +127102,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -107730,17 +127157,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 690 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 813 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id036 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -107748,15 +127182,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -107769,7 +127206,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -107784,25 +127221,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107816,9 +127253,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -107827,15 +127264,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -107879,17 +127319,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 691 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 814 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id036 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -107897,28 +127344,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -107933,31 +127383,31 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -107965,10 +127415,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107976,15 +127426,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108028,48 +127481,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 692 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 815 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id038 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108082,23 +127545,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -108109,7 +127576,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -108123,18 +127590,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108173,14 +127643,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 693 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 816 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -108191,15 +127668,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108212,7 +127692,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108227,42 +127707,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 16 + LVCA: 8 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108270,15 +127750,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108322,17 +127805,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 694 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 817 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -108340,15 +127830,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108361,42 +127854,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -108409,9 +127902,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108419,15 +127912,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108471,33 +127967,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 695 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 818 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id039 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108509,10 +128015,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -108525,38 +128031,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108564,20 +128074,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108616,33 +128129,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 696 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 819 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -108655,7 +128178,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108670,42 +128193,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108713,15 +128236,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -108765,17 +128291,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 697 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 820 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -108783,28 +128316,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108819,38 +128355,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108858,20 +128398,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -108910,46 +128453,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 698 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [8, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -108964,42 +128517,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109007,15 +128560,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109059,46 +128615,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 699 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 822 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109113,23 +128679,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109140,11 +128710,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109152,20 +128722,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109204,48 +128777,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 700 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 823 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109258,42 +128841,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109301,15 +128884,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109353,46 +128939,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 701 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 824 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109407,34 +129003,38 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -109448,18 +129048,21 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109498,48 +129101,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 702 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 825 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109552,42 +129165,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109597,13 +129210,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109647,17 +129263,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 703 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 826 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -109665,15 +129288,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -109685,8 +129311,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109701,38 +129327,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109740,20 +129370,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -109792,17 +129425,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 704 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 827 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -109810,28 +129450,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -109846,27 +129489,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -109877,7 +129520,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -109891,13 +129534,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -109941,14 +129587,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 705 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 828 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -109959,30 +129612,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -109995,38 +129651,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110034,20 +129694,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110086,46 +129749,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 706 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 829 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -110140,42 +129813,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110183,15 +129856,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110235,17 +129911,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 707 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 830 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -110253,30 +129936,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110289,38 +129975,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110330,18 +130020,21 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110380,17 +130073,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 708 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 831 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id035 - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -110398,15 +130098,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -110419,7 +130122,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -110434,7 +130137,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 @@ -110446,30 +130149,30 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110477,15 +130180,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110529,14 +130235,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 709 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 832 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id035 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -110547,28 +130260,31 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -110583,23 +130299,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6144 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -110610,7 +130330,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -110624,18 +130344,21 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110674,14 +130397,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 710 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 833 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id037 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 @@ -110692,30 +130422,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110728,42 +130461,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110771,15 +130504,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -110823,17 +130559,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 711 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 834 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id037 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -110841,15 +130584,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -110861,8 +130607,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -110877,38 +130623,42 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110916,20 +130666,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -110968,48 +130721,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 712 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 835 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -111022,25 +130785,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111053,11 +130816,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111065,15 +130828,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111094,6 +130860,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111103,6 +130870,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111117,79 +130885,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 713 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 836 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id033 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id034 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 - LVCA: 16 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111202,11 +130980,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111214,15 +130992,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111243,6 +131024,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111252,6 +131034,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111266,48 +131049,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 714 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 837 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id040 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -111319,26 +131112,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 2 - LSPA: 2 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111351,10 +131144,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 256 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 256 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111363,15 +131156,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111392,6 +131188,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111401,6 +131198,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111415,96 +131213,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 715 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 838 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id040 - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111512,15 +131320,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111541,6 +131352,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111550,6 +131362,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111564,79 +131377,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 716 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 839 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 - LVCA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111649,11 +131472,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111661,15 +131484,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111690,6 +131516,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111699,6 +131526,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111713,33 +131541,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 717 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 840 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -111752,57 +131590,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111810,15 +131648,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111839,6 +131680,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111848,6 +131690,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -111862,79 +131705,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 718 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 841 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 15 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 4 - LSPA: 4 - LSPB: 16 - LVCA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -111947,11 +131800,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111959,15 +131812,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -111988,6 +131844,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -111997,6 +131854,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112011,33 +131869,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 719 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 842 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id042 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112050,57 +131918,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112108,15 +131976,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112137,6 +132008,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112146,6 +132018,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112160,33 +132033,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 720 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 843 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112198,57 +132081,53 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 256 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 256 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 256 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112257,20 +132136,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -112286,6 +132168,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112295,6 +132178,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112309,33 +132193,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 721 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 844 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 17 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112348,57 +132242,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112406,15 +132300,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112435,6 +132332,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112444,6 +132342,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112458,95 +132357,101 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 722 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 845 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id043 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 256 + LSCB: 8 + LSPA: 4 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 256 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 256 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112555,20 +132460,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -112584,6 +132492,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112593,6 +132502,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112607,48 +132517,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 723 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id041 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 846 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 64 + SubGroup1: 4 + SubGroupA: 64 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id044 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -112664,24 +132584,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112692,11 +132612,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112706,13 +132626,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112733,6 +132656,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112742,6 +132666,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112756,14 +132681,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 724 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 847 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -112774,30 +132706,33 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -112813,28 +132748,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112842,10 +132777,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112855,13 +132790,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -112882,6 +132820,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -112891,6 +132830,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -112905,33 +132845,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 725 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 848 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id045 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 2 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -112944,7 +132894,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -112959,7 +132909,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 @@ -112971,15 +132921,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -112992,9 +132942,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113002,15 +132952,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -113031,6 +132984,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113040,6 +132994,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113054,18 +133009,25 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 726 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 849 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id045 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -113073,14 +133035,17 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113093,7 +133058,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -113108,41 +133073,41 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -113151,15 +133116,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -113180,6 +133148,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113189,6 +133158,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113203,13 +133173,20 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 727 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 850 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: true + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -113221,15 +133198,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id046 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 7 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113241,54 +133221,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113297,20 +133281,23 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113325,6 +133312,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113334,6 +133322,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113348,33 +133337,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 728 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppresssNoLoadLoop: false - ThreadTile: *id047 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 851 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 11 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113386,39 +133385,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 2 - LSPB: 16 - LVCA: 128 - LVCB: 16 - LVPA: 2 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2592 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -113430,10 +133433,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113442,20 +133445,23 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113470,6 +133476,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113479,6 +133486,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113493,92 +133501,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 729 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 852 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id048 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 16 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 64 LSCB: 16 - LSPA: 1 - LSPB: 16 - LVCA: 256 - LVCB: 16 - LVPA: 1 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4640 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 4096 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113586,21 +133608,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113615,6 +133640,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113624,6 +133650,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113638,33 +133665,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 730 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 853 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id050 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id048 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -113676,50 +133713,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -113731,21 +133772,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113760,6 +133804,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113769,6 +133814,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113783,81 +133829,95 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 731 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 854 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id047 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -113878,19 +133938,22 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -113905,6 +133968,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -113914,6 +133978,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -113928,48 +133993,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 732 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 855 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 32 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -113982,23 +134057,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114009,10 +134088,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114021,21 +134100,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -114050,6 +134132,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114059,6 +134142,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114073,17 +134157,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 733 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 856 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id047 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -114091,15 +134182,18 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114111,8 +134205,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -114127,23 +134221,27 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -114155,9 +134253,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114166,21 +134264,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -114195,6 +134296,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114204,6 +134306,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114218,17 +134321,24 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 734 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 857 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id050 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -114236,59 +134346,64 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id049 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114301,7 +134416,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -114314,12 +134429,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114340,6 +134460,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114349,6 +134470,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114363,74 +134485,89 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 735 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 858 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id052 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 16 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 32 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114439,15 +134576,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114459,14 +134596,19 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -114482,6 +134624,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114491,6 +134634,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114505,74 +134649,85 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 736 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id051 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 859 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 32 + SubGroupA: 16 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id053 - WorkGroupMapping: 8 + WorkGroup: [16, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 64 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -114589,10 +134744,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114601,13 +134756,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114628,6 +134788,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114637,6 +134798,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114651,33 +134813,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 737 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 860 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id052 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114689,8 +134861,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -114698,20 +134871,20 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -114748,12 +134921,19 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114765,6 +134945,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -114774,6 +134955,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114783,6 +134965,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114797,33 +134980,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 738 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 861 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id053 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114835,9 +135026,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -114850,25 +135042,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 2 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114882,9 +135074,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114899,7 +135091,12 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -114911,6 +135108,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -114920,6 +135118,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -114929,6 +135128,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -114943,33 +135143,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 739 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 862 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id055 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -114981,9 +135191,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -114996,25 +135207,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 2 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115028,9 +135239,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115041,11 +135252,18 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115057,6 +135275,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115066,6 +135285,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115075,6 +135295,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115089,47 +135310,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 740 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 863 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id055 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -115142,21 +135372,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115169,11 +135403,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115186,19 +135420,25 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115208,6 +135448,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115217,6 +135458,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115231,33 +135473,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 741 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: *id051 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 864 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115269,7 +135521,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115284,25 +135537,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115316,10 +135569,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115327,13 +135580,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115345,6 +135605,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115354,6 +135615,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115363,6 +135625,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115377,33 +135640,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 742 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 865 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115415,6 +135686,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -115430,21 +135702,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115458,9 +135734,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115469,24 +135745,30 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115496,6 +135778,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115505,6 +135788,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115519,33 +135803,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 743 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 866 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id054 - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115557,7 +135851,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -115572,7 +135867,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -115584,9 +135879,13 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115601,9 +135900,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115611,24 +135910,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115638,6 +135945,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115647,6 +135955,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115661,33 +135970,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 744 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 867 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 2 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115699,6 +136016,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -115714,7 +136032,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 @@ -115726,9 +136044,9 @@ LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -115747,9 +136065,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -115757,13 +136075,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115775,6 +136098,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115784,6 +136108,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115793,6 +136118,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115807,80 +136133,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 745 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 868 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: *id051 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id052 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -115891,10 +136228,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -115903,13 +136240,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -115921,6 +136265,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -115930,6 +136275,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -115939,6 +136285,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -115953,33 +136300,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 746 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 869 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id053 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + _staggerStrideShift: 3 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -115991,42 +136346,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116038,9 +136390,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -116049,24 +136401,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116076,6 +136436,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116085,6 +136446,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116099,28 +136461,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 747 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id054 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 870 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id053 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116132,49 +136501,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116185,10 +136550,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -116197,12 +136562,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116213,13 +136580,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116229,6 +136597,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116238,6 +136607,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116256,8 +136626,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 871 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116265,24 +136635,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116294,7 +136662,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116302,41 +136670,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 32 + LVCA: 48 LVCB: 4 - LVPA: 2 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3344 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116347,11 +136715,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116359,12 +136727,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116382,6 +136750,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116391,6 +136760,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116400,6 +136770,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116418,8 +136789,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 + SolutionIndex: 872 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116428,22 +136799,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -116456,15 +136827,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -116472,33 +136843,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116509,10 +136880,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -116521,12 +136892,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116538,12 +136911,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116553,6 +136927,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116562,6 +136937,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116580,8 +136956,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 873 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116590,23 +136966,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116625,42 +136999,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116672,10 +137046,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116683,12 +137057,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116700,12 +137076,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116715,6 +137092,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116724,6 +137102,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116742,8 +137121,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 874 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116752,23 +137131,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116788,41 +137165,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6688 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1600 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -116834,10 +137211,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -116845,12 +137222,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -116862,12 +137239,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -116877,6 +137255,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -116886,6 +137265,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -116904,8 +137284,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 + SolutionIndex: 875 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116914,19 +137294,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -116942,15 +137322,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -116958,48 +137338,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117007,12 +137387,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -117030,6 +137412,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117039,6 +137422,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117048,6 +137432,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117066,33 +137451,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 + SolutionIndex: 876 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117192,6 +137575,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117201,6 +137585,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117210,6 +137595,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117228,8 +137614,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 877 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117244,13 +137630,13 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -117266,16 +137652,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117291,28 +137677,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117320,10 +137706,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117331,13 +137717,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -117354,6 +137742,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117363,6 +137752,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117372,6 +137762,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117390,33 +137781,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 878 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117428,7 +137817,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117437,7 +137826,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117454,21 +137843,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -117481,7 +137870,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -117497,7 +137886,7 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -117516,6 +137905,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117525,6 +137915,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117534,6 +137925,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117552,8 +137944,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 + SolutionIndex: 879 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117568,15 +137960,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -117596,10 +137988,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -117618,36 +138010,32 @@ LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117655,8 +138043,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -117671,13 +138061,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117687,6 +138078,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117696,6 +138088,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117714,33 +138107,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 880 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117759,7 +138150,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -117777,38 +138168,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -117817,11 +138208,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -117840,6 +138233,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -117849,6 +138243,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -117858,6 +138253,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -117876,33 +138272,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 + SolutionIndex: 881 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117914,7 +138308,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117939,28 +138333,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 16 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 640 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -117968,10 +138362,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -117979,13 +138373,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118002,6 +138396,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118011,6 +138406,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118020,6 +138416,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118038,31 +138435,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 + SolutionIndex: 882 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 4, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -118076,14 +138473,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -118102,27 +138499,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118131,9 +138524,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118141,11 +138534,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -118157,13 +138552,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118173,6 +138569,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118182,6 +138579,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118200,33 +138598,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 + SolutionIndex: 883 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118245,7 +138641,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -118305,6 +138701,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -118326,6 +138724,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118335,6 +138734,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118344,6 +138744,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118362,8 +138763,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 884 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118378,7 +138779,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -118387,8 +138788,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118409,7 +138808,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118425,39 +138824,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118465,11 +138864,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -118488,6 +138887,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118497,6 +138897,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118506,6 +138907,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118524,28 +138926,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 885 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -118562,53 +138964,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118617,9 +139015,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118628,12 +139026,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118643,13 +139043,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118659,6 +139060,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118668,6 +139070,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118686,8 +139089,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 + SolutionIndex: 886 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118695,24 +139098,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118724,53 +139125,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -118778,10 +139175,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118789,13 +139186,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118805,13 +139204,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118821,6 +139221,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118830,6 +139231,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -118848,8 +139250,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 887 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118857,24 +139259,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118886,64 +139286,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -118952,12 +139348,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -118967,13 +139365,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -118983,6 +139382,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -118992,6 +139392,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119010,8 +139411,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 888 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119019,24 +139420,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119054,43 +139453,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 LVPA: 2 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -119102,10 +139497,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119113,13 +139508,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119129,13 +139526,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119145,6 +139543,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119154,6 +139553,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119172,33 +139572,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 889 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119216,58 +139614,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119276,12 +139670,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119291,13 +139687,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119307,6 +139704,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119316,6 +139714,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119334,8 +139733,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 890 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119343,24 +139742,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119372,16 +139769,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119398,27 +139795,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -119427,9 +139824,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119439,11 +139836,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119460,6 +139859,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119469,6 +139869,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119478,6 +139879,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119496,8 +139898,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 891 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119506,23 +139908,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119534,7 +139934,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -119559,28 +139959,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 LSPB: 32 - LVCA: 32 - LVCB: 8 + LVCA: 16 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -119588,9 +139988,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -119603,9 +140003,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119622,6 +140022,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119631,6 +140032,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119640,6 +140042,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119658,8 +140061,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 892 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119668,10 +140071,10 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -119679,10 +140082,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -119696,14 +140099,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -119722,38 +140125,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 2 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119761,13 +140164,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119784,6 +140189,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119793,6 +140199,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119802,6 +140209,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119820,8 +140228,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 893 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119830,23 +140238,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119858,7 +140264,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -119867,7 +140273,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -119883,24 +140289,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -119911,11 +140317,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119923,13 +140329,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -119946,6 +140352,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -119955,6 +140362,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -119964,6 +140372,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -119982,20 +140391,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 + SolutionIndex: 894 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -120003,10 +140412,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120026,10 +140435,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -120047,33 +140456,29 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -120085,13 +140490,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120101,13 +140508,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120117,6 +140525,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120126,6 +140535,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120144,8 +140554,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 895 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120153,7 +140563,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -120165,12 +140575,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120182,13 +140590,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -120208,27 +140616,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 16 + LSPA: 8 LSPB: 32 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -120247,13 +140651,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120263,13 +140667,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120279,6 +140684,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 0 @@ -120288,6 +140694,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -120306,8 +140713,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 + SolutionIndex: 896 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120315,7 +140722,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -120327,10 +140734,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -120350,43 +140757,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -120398,9 +140801,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -120409,12 +140812,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -120425,13 +140830,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120451,8 +140857,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120470,8 +140876,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 + SolutionIndex: 897 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120479,24 +140885,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120514,43 +140918,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1032 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -120562,9 +140962,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -120573,12 +140973,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -120589,13 +140991,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120615,8 +141018,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120634,33 +141037,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 + SolutionIndex: 898 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120679,42 +141080,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 64 LSCB: 8 LSPA: 4 LSPB: 32 - LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -120726,10 +141127,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120737,12 +141138,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -120760,6 +141163,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120779,8 +141183,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120798,33 +141202,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 + SolutionIndex: 899 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120842,43 +141244,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -120890,9 +141288,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -120902,12 +141300,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -120917,13 +141317,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -120943,8 +141344,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -120962,33 +141363,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 + SolutionIndex: 900 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121006,43 +141405,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121054,10 +141449,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121065,13 +141460,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121081,13 +141478,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121107,8 +141505,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121126,33 +141524,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 + SolutionIndex: 901 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 7 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121170,43 +141566,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121218,10 +141610,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121229,13 +141621,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121245,13 +141639,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121271,8 +141666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121290,33 +141685,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 + SolutionIndex: 902 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 15 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121335,42 +141728,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3080 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121382,9 +141775,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -121393,12 +141786,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -121416,6 +141811,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121435,8 +141831,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121454,8 +141850,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 + SolutionIndex: 903 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121464,23 +141860,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121498,43 +141892,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 776 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121546,10 +141936,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121557,13 +141947,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121573,13 +141965,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121599,8 +141992,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121618,33 +142011,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 + SolutionIndex: 904 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 17 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121663,38 +142054,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 + LSCA: 64 LSCB: 8 - LSPA: 4 + LSPA: 2 LSPB: 16 LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 8 + LVCB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 648 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121706,9 +142097,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -121717,13 +142108,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -121740,6 +142133,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121759,8 +142153,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121778,33 +142172,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 + SolutionIndex: 905 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 17 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121816,49 +142208,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -121869,10 +142257,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -121881,12 +142269,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -121897,13 +142287,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -121923,8 +142314,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -121942,33 +142333,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 + SolutionIndex: 906 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121980,45 +142369,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 256 - LSCB: 8 + LSCA: 64 + LSCB: 16 LSPA: 4 LSPB: 16 LVCA: 64 - LVCB: 4 - LVPA: 1 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -122029,11 +142418,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 256 - MacroTile1: 16 - MacroTileA: 256 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122041,12 +142430,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -122058,12 +142449,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122083,8 +142475,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122102,33 +142494,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 + SolutionIndex: 907 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 64 - SubGroup1: 4 - SubGroupA: 64 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122146,43 +142536,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -122195,9 +142581,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122205,12 +142591,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -122221,13 +142609,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122247,8 +142636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122266,8 +142655,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 + SolutionIndex: 908 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122275,24 +142664,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 2 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122310,58 +142697,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122370,12 +142753,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -122385,13 +142770,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122411,8 +142797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122430,8 +142816,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 + SolutionIndex: 909 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122439,24 +142825,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 2 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122474,43 +142858,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -122523,9 +142903,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122533,13 +142913,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -122549,13 +142931,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122575,8 +142958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122594,33 +142977,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 + SolutionIndex: 910 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 7 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122638,58 +143019,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -122697,13 +143074,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -122713,13 +143092,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122739,8 +143119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122758,8 +143138,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 + SolutionIndex: 911 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122767,24 +143147,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 7 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122802,54 +143180,50 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2064 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -122862,11 +143236,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -122877,13 +143253,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -122903,8 +143280,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -122922,33 +143299,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 + SolutionIndex: 912 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 11 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122966,43 +143341,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1552 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123015,9 +143386,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123025,12 +143396,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -123041,13 +143414,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123067,8 +143441,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123086,8 +143460,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 + SolutionIndex: 913 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -123095,24 +143469,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123130,58 +143502,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123189,13 +143557,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -123205,13 +143575,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123231,8 +143602,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123250,8 +143621,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 914 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -123259,24 +143630,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123288,60 +143657,56 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -123354,11 +143719,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -123369,13 +143736,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123395,8 +143763,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123414,33 +143782,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 + SolutionIndex: 915 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123452,64 +143818,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123519,11 +143881,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -123533,13 +143897,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123559,8 +143924,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123578,8 +143943,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 + SolutionIndex: 916 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -123587,24 +143952,22 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123622,43 +143985,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123670,10 +144029,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123681,13 +144040,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -123697,13 +144058,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123723,8 +144085,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123742,33 +144104,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 + SolutionIndex: 917 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123786,43 +144146,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123835,9 +144191,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123847,11 +144203,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -123861,13 +144219,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -123887,8 +144246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -123906,33 +144265,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 + SolutionIndex: 918 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123944,16 +144301,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123969,24 +144326,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -123997,11 +144350,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124009,13 +144362,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -124025,13 +144380,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124051,8 +144407,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124070,20 +144426,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 + SolutionIndex: 919 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -124091,12 +144447,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124108,16 +144462,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124133,24 +144487,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 16 - LSPB: 128 - LVCA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 LVCB: 4 - LVPA: 4 - LVPB: 32 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -124161,11 +144511,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124173,13 +144523,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -124189,13 +144539,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124215,8 +144566,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124234,20 +144585,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 + SolutionIndex: 920 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -124255,10 +144606,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124272,47 +144623,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124325,11 +144672,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124337,8 +144684,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124353,7 +144700,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -124379,8 +144726,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124398,8 +144745,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 + SolutionIndex: 921 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124407,22 +144754,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124442,9 +144789,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124452,48 +144799,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 16 + LVCA: 8 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124501,10 +144844,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124519,14 +144860,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124546,8 +144886,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124565,31 +144905,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 922 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124601,15 +144943,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124617,31 +144959,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124654,11 +144992,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124666,8 +145004,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124682,14 +145020,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124709,8 +145046,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124728,8 +145065,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 923 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124737,22 +145074,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -124766,15 +145103,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124782,31 +145119,27 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124819,11 +145152,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124831,10 +145164,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -124849,14 +145180,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -124876,8 +145206,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -124895,8 +145225,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 924 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124904,22 +145234,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124931,15 +145263,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -124947,37 +145279,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -124985,10 +145313,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124996,13 +145324,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -125012,14 +145340,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125039,8 +145366,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125058,31 +145385,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 925 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -125102,58 +145429,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125161,15 +145484,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -125179,14 +145500,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125206,8 +145526,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125225,31 +145545,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 926 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125269,56 +145591,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125326,12 +145648,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125349,7 +145671,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125369,8 +145690,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125388,28 +145709,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 927 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -125432,58 +145753,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125491,14 +145808,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125509,14 +145824,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125536,8 +145850,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125555,31 +145869,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 928 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125597,41 +145913,37 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -125645,10 +145957,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125656,12 +145968,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -125672,14 +145984,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125699,8 +146010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125718,8 +146029,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 929 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125727,18 +146038,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -125756,49 +146067,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125809,10 +146116,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125821,11 +146128,9 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -125839,14 +146144,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -125866,8 +146170,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -125885,8 +146189,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 930 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125894,22 +146198,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125928,38 +146234,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -125971,9 +146277,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -125982,14 +146288,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126007,7 +146311,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126027,8 +146330,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126046,8 +146349,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 931 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126056,21 +146359,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126089,53 +146394,53 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126143,14 +146448,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126162,13 +146465,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126188,8 +146490,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126207,31 +146509,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 932 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126243,13 +146547,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -126263,33 +146567,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3344 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126297,10 +146597,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126308,11 +146608,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -126324,14 +146624,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126351,8 +146650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126370,32 +146669,32 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 933 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -126408,14 +146707,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -126434,38 +146733,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126473,14 +146772,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -126492,13 +146789,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126518,8 +146814,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126537,31 +146833,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 934 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126573,14 +146871,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -126593,44 +146891,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126638,15 +146932,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -126656,14 +146948,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126683,8 +146974,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126702,31 +146993,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 935 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126738,7 +147031,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -126758,44 +147051,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6688 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1600 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126803,13 +147096,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -126820,13 +147113,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -126846,8 +147138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -126865,31 +147157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 936 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -126903,15 +147195,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -126919,37 +147211,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -126957,10 +147245,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -126968,15 +147256,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -126986,14 +147272,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127013,8 +147298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127032,31 +147317,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 937 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127068,7 +147355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -127076,7 +147363,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127084,37 +147371,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -127122,10 +147409,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127133,13 +147420,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127156,7 +147443,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127176,8 +147462,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127195,31 +147481,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 938 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127233,15 +147519,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127249,33 +147535,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1056 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127286,11 +147568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127298,10 +147580,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -127316,14 +147596,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127343,8 +147622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127362,8 +147641,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 939 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127371,22 +147650,24 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127398,15 +147679,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -127414,33 +147695,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127451,10 +147728,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127463,12 +147740,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -127479,14 +147756,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127506,8 +147782,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127525,8 +147801,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 940 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127534,22 +147810,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127570,38 +147846,38 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -127613,9 +147889,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127624,14 +147900,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -127649,7 +147923,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127669,8 +147942,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127688,8 +147961,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 941 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127698,21 +147971,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127730,58 +148005,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 544 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127789,15 +148060,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -127807,14 +148076,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127834,8 +148102,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -127853,31 +148121,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 942 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127895,58 +148165,54 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127954,11 +148220,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -127970,14 +148236,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -127997,8 +148262,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128016,28 +148281,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 943 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -128060,54 +148325,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128115,10 +148384,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -128133,14 +148400,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128160,8 +148426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128179,31 +148445,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 944 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128215,49 +148483,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -128268,11 +148536,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128280,14 +148548,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -128305,7 +148571,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128325,8 +148590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128344,8 +148609,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 945 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128354,21 +148619,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128380,7 +148647,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -128388,56 +148655,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128445,11 +148712,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -128462,13 +148729,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128488,8 +148754,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128507,31 +148773,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 946 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -128545,49 +148811,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1088 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128595,10 +148861,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128606,15 +148872,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128631,7 +148895,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128651,8 +148914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128670,31 +148933,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 947 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128706,49 +148971,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128756,9 +149025,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -128767,15 +149036,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128785,14 +149052,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128812,8 +149078,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128831,31 +149097,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 948 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128867,49 +149135,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -128917,9 +149189,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -128928,15 +149200,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -128946,14 +149216,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -128973,8 +149242,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -128992,31 +149261,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 826 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 949 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129028,59 +149299,59 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -129089,15 +149360,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129114,7 +149383,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129134,8 +149402,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129153,14 +149421,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 827 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 950 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -129169,15 +149437,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129189,59 +149459,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -129250,15 +149524,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129268,14 +149540,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129295,8 +149566,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129314,31 +149585,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 828 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 951 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129350,15 +149623,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129366,37 +149639,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 3648 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129404,9 +149677,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -129415,15 +149688,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129434,13 +149705,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129460,8 +149730,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129479,31 +149749,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 829 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 952 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129515,15 +149787,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129531,37 +149803,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129569,9 +149837,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -129580,13 +149848,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129596,14 +149864,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129623,8 +149890,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129642,31 +149909,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 830 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 953 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -129680,53 +149947,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3648 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129734,10 +150001,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129745,15 +150012,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -129770,7 +150035,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129790,8 +150054,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129809,31 +150073,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 831 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 954 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129845,7 +150111,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -129853,7 +150119,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -129861,37 +150127,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 1856 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -129899,10 +150165,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129910,11 +150176,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -129933,7 +150199,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -129953,8 +150218,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -129972,31 +150237,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 832 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 955 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -130010,45 +150275,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -130059,7 +150328,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -130071,15 +150340,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130089,14 +150356,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130116,8 +150382,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130135,31 +150401,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 833 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 956 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130171,7 +150439,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -130179,52 +150447,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130232,13 +150500,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130255,7 +150523,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130275,8 +150542,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130294,31 +150561,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 834 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 957 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -130332,49 +150599,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130382,10 +150653,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130393,14 +150664,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130411,14 +150680,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130438,8 +150706,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130457,31 +150725,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 835 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 958 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130493,49 +150763,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1032 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130543,10 +150813,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130554,14 +150824,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130579,7 +150847,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130599,8 +150866,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130618,31 +150885,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 836 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 959 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130654,53 +150923,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3080 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130708,10 +150977,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130719,14 +150988,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -130744,7 +151011,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130764,8 +151030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130783,31 +151049,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 837 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 960 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130819,49 +151087,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -130869,9 +151141,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -130880,15 +151152,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -130898,14 +151168,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -130925,8 +151194,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -130944,31 +151213,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 838 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 961 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130980,49 +151251,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131030,10 +151305,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131041,15 +151316,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131059,14 +151332,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131086,8 +151358,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131105,31 +151377,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 839 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 962 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131141,49 +151415,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 832 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131191,10 +151465,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131202,13 +151476,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -131227,7 +151499,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131247,8 +151518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131266,31 +151537,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 840 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 963 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131302,53 +151575,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3080 + LdsNumElements: 1856 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131356,10 +151629,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131367,15 +151640,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131392,7 +151663,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131412,8 +151682,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131431,31 +151701,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 841 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 964 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131467,60 +151739,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 776 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131528,13 +151804,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -131546,14 +151820,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131573,8 +151846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131592,31 +151865,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 842 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 965 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131628,49 +151903,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 648 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -131678,9 +151957,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131689,15 +151968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -131707,14 +151984,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131734,8 +152010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131753,31 +152029,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 843 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 966 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131789,60 +152067,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131850,14 +152132,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -131868,14 +152148,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -131895,8 +152174,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -131914,31 +152193,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 844 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 967 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131950,59 +152231,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -132011,13 +152296,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -132029,14 +152312,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132056,8 +152338,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132075,31 +152357,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 845 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 968 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132111,60 +152395,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132172,15 +152460,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132190,14 +152476,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132217,8 +152502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132236,31 +152521,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 846 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 969 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132272,60 +152559,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132333,14 +152624,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -132351,14 +152640,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132378,8 +152666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132397,31 +152685,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 847 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 970 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132433,59 +152723,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -132494,13 +152788,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -132512,14 +152804,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132539,8 +152830,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132558,31 +152849,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 848 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 971 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132594,59 +152887,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -132655,15 +152952,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -132673,14 +152968,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132700,8 +152994,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132719,31 +153013,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 849 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 972 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132755,60 +153051,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2064 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132816,14 +153112,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -132841,7 +153135,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -132861,8 +153154,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -132880,31 +153173,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 850 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 973 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -132916,60 +153211,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1552 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132977,14 +153276,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -132995,14 +153292,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133022,8 +153318,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133041,31 +153337,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 851 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 974 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133077,59 +153375,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133138,15 +153440,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133156,14 +153456,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133183,8 +153482,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133202,14 +153501,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 852 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 975 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -133218,15 +153517,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133244,54 +153545,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3104 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133299,14 +153604,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -133317,14 +153620,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133344,8 +153646,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133363,15 +153665,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 853 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 976 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -133379,15 +153681,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133399,49 +153703,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -133449,9 +153757,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133460,15 +153768,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -133478,14 +153784,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133505,8 +153810,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133524,31 +153829,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 854 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 977 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133560,49 +153867,53 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 2 - LSPB: 16 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -133610,10 +153921,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133621,14 +153932,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -133639,14 +153948,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133666,8 +153974,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133685,31 +153993,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 855 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 978 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133721,60 +154031,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 8 LVCB: 16 - LVPA: 2 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 1856 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133782,14 +154096,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -133800,14 +154112,13 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133827,8 +154138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -133846,31 +154157,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 856 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 979 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133882,15 +154195,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -133898,29 +154211,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -133931,10 +154244,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133943,14 +154256,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -133962,13 +154273,12 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133988,8 +154298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134007,31 +154317,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 857 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 980 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -134043,7 +154355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -134051,7 +154363,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -134059,29 +154371,29 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -134092,11 +154404,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134104,12 +154416,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -134127,7 +154439,6 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false - ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -134147,8 +154458,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134166,31 +154477,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 858 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 981 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134204,7 +154515,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -134224,39 +154535,39 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 8 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -134265,8 +154576,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134282,7 +154593,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -134307,8 +154618,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134326,14 +154637,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 859 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 982 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -134347,10 +154658,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134364,7 +154675,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -134389,14 +154700,14 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 16 - LVCA: 8 - LVCB: 4 - LVPA: 4 - LVPB: 8 + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 @@ -134413,11 +154724,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134442,7 +154753,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -134467,8 +154778,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134486,15 +154797,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 860 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 983 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -134507,10 +154818,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134544,40 +154855,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 4 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134585,12 +154896,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -134627,8 +154938,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134646,15 +154957,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 861 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 984 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -134667,7 +154978,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -134684,7 +154995,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -134709,35 +155020,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134745,8 +155056,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -134787,8 +155098,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134806,31 +155117,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 862 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 985 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -134878,26 +155189,26 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 800 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -134905,12 +155216,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -134947,8 +155258,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -134966,15 +155277,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 863 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 986 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -134987,7 +155298,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -135024,40 +155335,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 4 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135065,13 +155376,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -135082,7 +155393,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -135107,8 +155418,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135126,15 +155437,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 864 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 987 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -135147,7 +155458,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -135170,7 +155481,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -135189,24 +155500,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 4 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -135218,10 +155525,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135245,7 +155552,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -135271,8 +155578,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135290,16 +155597,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 865 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 988 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -135311,7 +155618,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -135353,20 +155660,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -135378,10 +155685,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 8 + MacroTile1: 64 + MacroTileA: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135394,7 +155701,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -135406,7 +155713,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -135431,8 +155738,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135450,15 +155757,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 866 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 989 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -135471,7 +155778,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -135508,40 +155815,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 4 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -135549,12 +155856,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -135591,8 +155898,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135610,15 +155917,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 867 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 990 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -135631,8 +155938,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -135668,39 +155975,39 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 4 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 1120 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 64 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -135709,8 +156016,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -135726,7 +156033,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -135751,8 +156058,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135770,29 +156077,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 868 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 991 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -135814,7 +156121,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -135828,39 +156135,43 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 4 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 2 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 4 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 4 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -135869,8 +156180,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -135885,7 +156196,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -135911,8 +156222,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -135930,29 +156241,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 869 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 992 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [2, 32, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -135974,7 +156285,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -135993,20 +156304,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 4 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 2 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 64 + LdsOffsetB_Blk: 2112 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136018,10 +156333,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136034,7 +156349,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136045,8 +156360,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -136071,8 +156386,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136090,16 +156405,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 870 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 993 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -136111,7 +156426,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [2, 32, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -136153,20 +156468,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 32 LSPA: 32 LSPB: 16 - LVCA: 8 + LVCA: 4 LVCB: 16 LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1344 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136178,10 +156493,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136194,7 +156509,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136206,7 +156521,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -136231,8 +156546,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136250,15 +156565,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 871 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 994 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -136271,7 +156586,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -136313,24 +156628,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -136342,10 +156657,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136357,8 +156672,8 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -136395,8 +156710,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136414,15 +156729,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 872 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 995 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 @@ -136435,7 +156750,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -136458,7 +156773,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -136479,33 +156794,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 8 + LSPA: 32 + LSPB: 16 LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136517,9 +156836,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136529,8 +156848,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -136555,8 +156874,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136574,20 +156893,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 873 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 996 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -136595,7 +156914,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -136639,37 +156958,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 8 + LSPA: 32 + LSPB: 16 LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136681,9 +157000,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136719,8 +157038,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136738,20 +157057,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 874 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 997 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -136759,7 +157078,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -136801,35 +157120,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -136841,9 +157160,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -136879,8 +157198,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -136898,29 +157217,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 875 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 998 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -136953,7 +157272,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -136961,39 +157280,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 8 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137005,9 +157324,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -137043,8 +157362,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137062,28 +157381,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 876 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 999 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [4, 8, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -137100,7 +157419,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137120,40 +157439,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 4 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1056 + LdsNumElements: 2240 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 128 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 4 + MacroTile1: 64 + MacroTileA: 4 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137161,12 +157480,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137203,8 +157522,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137222,15 +157541,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 877 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1000 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -137243,10 +157562,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [2, 32, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137260,13 +157579,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137285,35 +157604,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137321,8 +157644,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -137337,8 +157660,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -137363,8 +157686,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137382,31 +157705,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 878 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1001 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137420,13 +157743,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137440,40 +157763,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137481,12 +157808,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137497,7 +157824,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -137523,8 +157850,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137542,31 +157869,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 879 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1002 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137580,7 +157907,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137600,21 +157927,21 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 544 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 @@ -137622,18 +157949,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137641,13 +157968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -137683,8 +158010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137702,31 +158029,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 880 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1003 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137740,13 +158067,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -137760,29 +158087,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137790,10 +158121,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137801,12 +158132,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -137817,7 +158148,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -137843,8 +158174,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -137862,31 +158193,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 881 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1004 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [4, 8, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -137900,7 +158231,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -137920,33 +158251,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -137954,10 +158285,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -137965,12 +158296,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138007,8 +158338,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138026,15 +158357,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 882 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1005 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 @@ -138047,10 +158378,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [2, 16, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -138084,7 +158415,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -138092,36 +158423,36 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138129,12 +158460,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138171,8 +158502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138190,15 +158521,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 883 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 1006 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 @@ -138211,7 +158542,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138253,22 +158584,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -138282,10 +158613,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138297,7 +158628,7 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138310,7 +158641,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -138335,8 +158666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138354,15 +158685,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 884 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1007 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 @@ -138375,7 +158706,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138398,7 +158729,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -138412,23 +158743,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -138442,9 +158777,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -138453,11 +158788,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138469,7 +158804,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -138495,8 +158830,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138514,28 +158849,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 885 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1008 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138572,43 +158907,43 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -138617,11 +158952,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -138659,8 +158994,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138678,14 +159013,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 886 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1009 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [2, 2] @@ -138699,7 +159034,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138750,30 +159085,30 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138781,12 +159116,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -138798,7 +159133,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -138823,8 +159158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -138842,28 +159177,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 887 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1010 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -138886,7 +159221,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -138900,7 +159235,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -138908,32 +159243,36 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -138941,7 +159280,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -138957,7 +159296,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -138983,8 +159322,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139002,16 +159341,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 888 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1011 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [4, 2] ThreadTile0: 4 ThreadTile1: 2 @@ -139023,7 +159362,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -139060,43 +159399,43 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -139105,11 +159444,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -139147,8 +159486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139166,28 +159505,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 889 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1012 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -139224,27 +159563,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -139258,10 +159597,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139269,12 +159608,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -139286,7 +159625,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -139311,8 +159650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139330,29 +159669,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 890 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1013 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139374,7 +159713,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -139393,18 +159732,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -139418,10 +159761,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139433,8 +159776,8 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -139445,7 +159788,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -139471,8 +159814,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139490,16 +159833,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 891 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1014 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [2, 4] ThreadTile0: 2 ThreadTile1: 4 @@ -139511,8 +159854,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139553,39 +159896,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 16 - LVPA: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139593,12 +159936,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -139635,8 +159978,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139654,29 +159997,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 892 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1015 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139719,37 +160062,37 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 8 + LSPA: 32 + LSPB: 16 LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1856 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -139761,9 +160104,9 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139799,8 +160142,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139818,8 +160161,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 893 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1016 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -139828,19 +160171,19 @@ SubGroupA: 8 SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -139883,12 +160226,12 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 16 + LSPA: 16 + LSPB: 32 LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 @@ -139902,14 +160245,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -139921,13 +160264,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -139963,8 +160306,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -139982,28 +160325,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 894 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 + SolutionIndex: 1017 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -140026,7 +160369,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -140037,7 +160380,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -140045,18 +160388,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -140070,10 +160417,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140085,7 +160432,7 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -140097,8 +160444,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -140123,8 +160470,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140142,15 +160489,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 895 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1018 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -140163,8 +160510,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140197,7 +160544,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -140226,14 +160573,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -140262,7 +160609,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -140287,8 +160634,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140306,29 +160653,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 896 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1019 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140361,42 +160708,42 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -140405,11 +160752,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -140447,8 +160794,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140466,29 +160813,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 897 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1020 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140521,46 +160868,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -140569,11 +160916,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -140611,8 +160958,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140630,29 +160977,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 898 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1021 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -140685,10 +161032,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -140702,9 +161049,9 @@ LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -140714,18 +161061,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140733,12 +161080,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -140750,7 +161097,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -140775,8 +161122,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140794,16 +161141,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 899 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1022 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroupB: 4 + SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 ThreadTile1: 4 @@ -140815,7 +161162,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 4, 8] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -140832,7 +161179,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -140857,28 +161204,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -140886,10 +161233,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -140897,12 +161244,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -140939,8 +161286,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -140958,20 +161305,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 900 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1023 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -140979,10 +161326,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -140996,13 +161343,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -141016,29 +161363,33 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 832 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141046,10 +161397,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141057,13 +161408,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141073,7 +161424,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -141099,8 +161450,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141118,31 +161469,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 901 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1024 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141156,13 +161507,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -141176,33 +161527,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141210,10 +161557,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141221,13 +161568,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141237,8 +161584,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -141263,8 +161610,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141282,31 +161629,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 902 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1025 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141320,13 +161667,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -141337,7 +161684,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -141345,28 +161692,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -141374,10 +161717,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141385,13 +161728,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -141401,8 +161744,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -141427,8 +161770,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141446,31 +161789,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 903 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 + SolutionIndex: 1026 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141484,7 +161827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -141492,56 +161835,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141549,11 +161892,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -141566,7 +161909,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -141591,8 +161934,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141610,31 +161953,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 904 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1027 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141648,7 +161991,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -141656,56 +161999,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141713,8 +162056,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -141755,8 +162098,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141774,31 +162117,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 905 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 + SolutionIndex: 1028 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -141820,56 +162163,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -141877,11 +162220,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -141919,8 +162262,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -141938,29 +162281,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 906 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1029 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -141976,7 +162319,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -141993,36 +162336,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 + LdsNumElements: 3088 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142030,10 +162373,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142041,13 +162384,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142058,7 +162401,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -142083,8 +162426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142102,31 +162445,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 907 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1030 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142140,13 +162483,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -142157,36 +162500,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142194,10 +162533,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142205,13 +162544,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142221,7 +162560,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -142247,8 +162586,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142266,31 +162605,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 908 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 + SolutionIndex: 1031 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142304,7 +162643,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142321,7 +162660,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -142329,16 +162668,16 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3088 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -142350,7 +162689,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -142358,10 +162697,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142369,13 +162708,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142411,8 +162750,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142430,20 +162769,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 909 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 + SolutionIndex: 1032 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -142451,10 +162790,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142468,7 +162807,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142485,30 +162824,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -142521,11 +162860,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142533,13 +162872,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142550,7 +162889,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -142575,8 +162914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142594,31 +162933,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 910 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1033 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142632,7 +162971,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142649,43 +162988,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142693,12 +163032,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -142735,8 +163074,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142754,31 +163093,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1034 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142792,7 +163131,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -142809,30 +163148,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -142845,11 +163184,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -142857,13 +163196,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -142899,8 +163238,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -142918,31 +163257,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1035 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -142956,13 +163295,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -142973,7 +163312,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -142982,21 +163321,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -143009,11 +163344,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143021,13 +163356,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143037,7 +163372,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -143063,8 +163398,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143082,31 +163417,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1036 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143120,7 +163455,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -143137,7 +163472,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -143146,21 +163481,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -143173,11 +163508,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143185,13 +163520,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143227,8 +163562,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143246,31 +163581,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 + SolutionIndex: 1037 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143284,13 +163619,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143301,7 +163636,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -143309,39 +163644,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143349,11 +163680,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -143365,8 +163696,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -143391,8 +163722,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143410,20 +163741,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1038 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -143431,10 +163762,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143448,13 +163779,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143465,47 +163796,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143513,13 +163840,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143529,8 +163856,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -143555,8 +163882,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143574,31 +163901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1039 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143629,30 +163956,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 + LSPA: 32 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1856 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -143666,10 +163993,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -143677,13 +164004,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143719,8 +164046,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143738,29 +164065,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 + SolutionIndex: 1040 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -143776,13 +164103,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143801,34 +164128,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSPB: 32 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -143841,9 +164172,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -143853,8 +164184,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -143879,8 +164210,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -143898,31 +164229,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1041 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -143936,13 +164267,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -143961,24 +164292,28 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSPB: 64 + LVCA: 64 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -143986,9 +164321,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144001,9 +164336,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144013,7 +164348,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144039,8 +164374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144058,20 +164393,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 + SolutionIndex: 1042 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -144079,10 +164414,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144096,13 +164431,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144116,25 +164451,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 32 + LVCB: 16 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144145,11 +164484,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144157,13 +164496,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144173,8 +164512,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144199,8 +164538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144218,31 +164557,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1043 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144256,13 +164595,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -144276,25 +164615,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 4 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144305,10 +164648,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144317,13 +164660,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144333,7 +164676,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144359,8 +164702,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144378,31 +164721,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1044 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144422,53 +164765,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144477,12 +164824,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -144493,8 +164840,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144519,8 +164866,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144538,28 +164885,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1045 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -144576,59 +164923,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 128 + LSCB: 16 LSPA: 8 LSPB: 64 - LVCA: 4 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 32 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 8 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -144637,11 +164988,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -144653,7 +165004,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -144679,8 +165030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144698,31 +165049,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1046 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -144744,37 +165095,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -144786,10 +165137,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144797,13 +165148,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -144839,8 +165190,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -144858,29 +165209,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 + SolutionIndex: 1047 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -144902,54 +165253,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -144957,8 +165312,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -144973,8 +165328,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -144999,8 +165354,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145018,29 +165373,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1048 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -145062,54 +165417,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145117,12 +165476,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145133,7 +165492,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -145159,8 +165518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145178,29 +165537,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1049 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -145222,53 +165581,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 8 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145277,12 +165640,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -145293,8 +165656,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145319,8 +165682,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145338,29 +165701,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1050 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -145376,59 +165739,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 2 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145437,11 +165804,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -145453,8 +165820,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -145479,8 +165846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145498,31 +165865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1051 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145536,15 +165903,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -145552,29 +165919,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1120 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 64 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145585,11 +165956,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145597,13 +165968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145613,7 +165984,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -145639,8 +166010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145658,31 +166029,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1052 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145696,7 +166067,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145704,41 +166075,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 2 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145749,10 +166120,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 4 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -145761,13 +166132,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145803,8 +166174,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145822,31 +166193,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1053 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -145860,7 +166231,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -145868,7 +166239,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -145876,33 +166247,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 2 - LVCB: 8 - LVPA: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 64 - LdsOffsetB_Blk: 2112 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -145913,11 +166284,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -145925,13 +166296,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -145967,8 +166338,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -145986,31 +166357,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 + SolutionIndex: 1054 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -146030,39 +166401,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1344 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -146074,10 +166449,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146085,13 +166460,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146101,7 +166476,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -146127,8 +166502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146146,28 +166521,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1055 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [8, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -146192,7 +166567,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -146200,48 +166575,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 128 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146249,13 +166624,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146291,8 +166666,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146310,28 +166685,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 + SolutionIndex: 1056 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 32 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -146356,7 +166731,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -146364,48 +166739,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPB: 32 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146413,13 +166788,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146430,7 +166805,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -146455,8 +166830,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146474,29 +166849,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1057 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -146520,56 +166895,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146577,13 +166952,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146619,8 +166994,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146638,29 +167013,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 + SolutionIndex: 1058 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -146682,9 +167057,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -146692,44 +167067,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 4 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146737,13 +167116,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -146753,7 +167132,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -146779,8 +167158,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146798,29 +167177,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1059 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -146844,56 +167223,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -146901,11 +167280,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -146943,8 +167322,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -146962,29 +167341,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 + SolutionIndex: 1060 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147006,9 +167385,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -147016,29 +167395,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 4 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 2 + LSPB: 32 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2240 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -147050,10 +167433,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 4 - MacroTile1: 64 - MacroTileA: 4 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147061,13 +167444,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147077,8 +167460,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -147103,8 +167486,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147122,29 +167505,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 + SolutionIndex: 1061 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [2, 32, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147168,41 +167551,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -147214,10 +167597,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147225,13 +167608,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147267,8 +167650,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147286,29 +167669,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 + SolutionIndex: 1062 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147332,56 +167715,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSPB: 64 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147389,13 +167772,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147431,8 +167814,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147450,29 +167833,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 + SolutionIndex: 1063 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147494,9 +167877,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -147504,44 +167887,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 + LSCA: 128 LSCB: 32 LSPA: 32 - LSPB: 16 - LVCA: 4 + LSPB: 64 + LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147549,13 +167936,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -147565,7 +167952,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147591,8 +167978,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147610,29 +167997,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1064 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -147648,13 +168035,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147668,44 +168055,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147713,12 +168096,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147729,8 +168112,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -147755,8 +168138,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147774,20 +168157,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1065 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 4 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -147795,10 +168178,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147812,13 +168195,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -147832,44 +168215,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 2 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -147877,12 +168256,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -147893,7 +168272,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -147919,8 +168298,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -147938,31 +168317,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 + SolutionIndex: 1066 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [2, 16, 8] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -147984,56 +168363,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148041,12 +168420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -148083,8 +168462,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148102,29 +168481,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1067 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -148140,13 +168519,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148160,44 +168539,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148205,11 +168580,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148221,7 +168596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148247,8 +168622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148266,31 +168641,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1068 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148304,13 +168679,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148330,38 +168705,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148369,11 +168740,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148385,7 +168756,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148411,8 +168782,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148430,20 +168801,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 + SolutionIndex: 1069 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -148451,10 +168822,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148468,7 +168839,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148494,21 +168865,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -148521,11 +168892,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148533,11 +168904,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -148575,8 +168946,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148594,31 +168965,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 + SolutionIndex: 1070 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148632,13 +169003,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148658,21 +169029,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -148685,11 +169052,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148697,11 +169064,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -148713,7 +169080,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -148739,8 +169106,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148758,20 +169125,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1071 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -148779,10 +169146,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148796,7 +169163,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -148816,44 +169183,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -148861,12 +169228,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -148903,8 +169270,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -148922,31 +169289,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 + SolutionIndex: 1072 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -148960,13 +169327,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -148980,33 +169347,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149015,9 +169378,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149025,12 +169388,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149041,7 +169404,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -149067,8 +169430,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149086,20 +169449,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1073 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149107,10 +169470,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149124,13 +169487,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -149144,33 +169507,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149179,9 +169538,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149189,12 +169548,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149205,7 +169564,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -149231,8 +169590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149250,31 +169609,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 + SolutionIndex: 1074 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149288,7 +169647,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149314,27 +169673,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149343,9 +169702,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149353,12 +169712,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149395,8 +169754,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149414,20 +169773,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 + SolutionIndex: 1075 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149435,10 +169794,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149452,7 +169811,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149477,22 +169836,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -149505,11 +169864,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149517,12 +169876,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149559,8 +169918,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149578,31 +169937,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 + SolutionIndex: 1076 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149636,7 +169995,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -149650,9 +170009,9 @@ LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -149671,9 +170030,9 @@ LoopTail: true LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149681,12 +170040,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -149723,8 +170082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149742,20 +170101,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1077 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -149763,7 +170122,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -149780,7 +170139,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149788,41 +170147,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -149833,11 +170192,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -149845,13 +170204,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -149887,8 +170246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -149906,31 +170265,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 + SolutionIndex: 1078 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -149944,7 +170303,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -149952,45 +170311,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -149998,10 +170357,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150009,11 +170368,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -150026,7 +170385,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -150051,8 +170410,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150070,31 +170429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 + SolutionIndex: 1079 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150108,7 +170467,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150116,56 +170475,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150173,8 +170532,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150190,7 +170549,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -150215,8 +170574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150234,31 +170593,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1080 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150278,54 +170637,58 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150333,12 +170696,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -150349,7 +170712,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -150375,8 +170738,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150394,28 +170757,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1081 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -150440,56 +170803,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150497,12 +170860,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -150539,8 +170902,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150558,29 +170921,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 + SolutionIndex: 1082 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -150604,56 +170967,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150661,7 +171024,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -150703,8 +171066,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150722,29 +171085,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1083 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -150760,7 +171123,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -150785,39 +171148,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150825,8 +171188,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -150867,8 +171230,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -150886,15 +171249,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1084 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -150907,10 +171270,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -150924,13 +171287,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -150949,39 +171312,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -150989,8 +171348,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -151005,7 +171364,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -151031,8 +171390,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151050,16 +171409,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1085 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -151071,10 +171430,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151088,13 +171447,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151113,35 +171472,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151149,8 +171512,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -151165,8 +171528,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -151191,8 +171554,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151210,16 +171573,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1086 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -151231,10 +171594,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151248,13 +171611,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -151273,35 +171636,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151309,8 +171676,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -151325,8 +171692,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -151351,8 +171718,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151370,16 +171737,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1087 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -151391,10 +171758,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151408,7 +171775,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151416,56 +171783,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151473,12 +171840,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151515,8 +171882,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151534,31 +171901,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1088 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151572,7 +171939,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151580,56 +171947,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151637,12 +172004,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151679,8 +172046,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151698,15 +172065,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1089 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -151718,11 +172085,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151736,7 +172103,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151745,7 +172112,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -151762,38 +172129,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151801,12 +172168,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -151843,8 +172210,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -151862,15 +172229,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1090 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -151883,10 +172250,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -151900,7 +172267,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -151908,7 +172275,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -151916,37 +172283,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -151955,9 +172322,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -151965,8 +172332,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152007,8 +172374,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152026,15 +172393,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1091 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -152046,11 +172413,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152064,15 +172431,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -152080,33 +172447,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -152115,9 +172486,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152125,8 +172496,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152141,7 +172512,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -152167,8 +172538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152186,16 +172557,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1092 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -152206,11 +172577,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152224,7 +172595,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152232,45 +172603,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -152290,7 +172661,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -152331,8 +172702,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152350,31 +172721,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1093 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152388,7 +172759,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152396,52 +172767,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -152453,13 +172824,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152495,8 +172866,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152514,8 +172885,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1094 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -152534,11 +172905,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152552,15 +172923,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -152568,44 +172939,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 128 + LSCB: 32 + LSPA: 16 LSPB: 32 LVCA: 32 - LVCB: 8 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152613,13 +172988,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152629,7 +173004,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -152655,8 +173030,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152674,31 +173049,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1095 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [32, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152712,7 +173087,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -152720,41 +173095,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 6272 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -152765,10 +173140,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -152778,12 +173153,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152819,8 +173194,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152838,14 +173213,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1096 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -152858,11 +173233,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -152876,60 +173251,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -152937,13 +173316,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -152953,8 +173332,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -152979,8 +173358,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -152998,16 +173377,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1097 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -153018,11 +173397,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153036,49 +173415,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -153089,11 +173464,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153102,12 +173477,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153117,7 +173492,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -153143,8 +173518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153162,16 +173537,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1098 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -153182,11 +173557,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 4] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153200,60 +173575,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153261,12 +173640,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153277,8 +173656,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -153303,8 +173682,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153322,16 +173701,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1099 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -153342,11 +173721,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153360,60 +173739,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153421,12 +173804,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -153437,7 +173820,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -153463,8 +173846,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153482,31 +173865,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -153528,56 +173911,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 16 - LVPB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153585,13 +173968,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153627,8 +174010,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153646,15 +174029,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 + SolutionIndex: 1101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -153666,9 +174049,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -153692,41 +174075,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 32 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -153738,9 +174121,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -153750,12 +174133,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153791,8 +174174,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153810,14 +174193,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -153830,9 +174213,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -153854,7 +174237,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -153865,7 +174248,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -153873,22 +174256,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 64 - LVCA: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -153902,10 +174281,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -153917,9 +174296,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -153929,7 +174308,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -153955,8 +174334,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -153974,16 +174353,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 + SolutionIndex: 1103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -153995,7 +174374,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -154018,7 +174397,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -154029,7 +174408,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -154039,37 +174418,33 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 8 + LSPB: 16 LVCA: 32 LVCB: 16 - LVPA: 16 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154077,13 +174452,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154093,7 +174468,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -154119,8 +174494,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154138,16 +174513,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 + SolutionIndex: 1104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -154159,7 +174534,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -154193,7 +174568,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -154201,22 +174576,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 32 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -154230,9 +174605,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -154246,8 +174621,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -154283,8 +174658,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154302,16 +174677,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -154323,7 +174698,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -154340,7 +174715,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -154348,31 +174723,31 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 @@ -154382,22 +174757,22 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154405,12 +174780,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -154447,8 +174822,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154466,16 +174841,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -154486,11 +174861,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154504,64 +174879,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 16 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 8 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154569,11 +174940,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -154585,8 +174956,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -154611,8 +174982,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154630,31 +175001,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154668,7 +175039,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -154676,37 +175047,37 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154717,11 +175088,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154729,8 +175100,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -154746,7 +175117,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -154771,8 +175142,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154790,8 +175161,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -154800,21 +175171,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -154834,43 +175205,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -154882,10 +175249,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -154893,8 +175260,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -154909,8 +175276,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -154935,8 +175302,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -154954,8 +175321,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -154963,18 +175330,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B @@ -154998,43 +175365,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155046,10 +175409,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155057,12 +175420,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -155073,7 +175436,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -155099,8 +175462,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155118,8 +175481,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 + SolutionIndex: 1110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -155127,18 +175490,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B @@ -155156,49 +175519,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 528 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -155209,11 +175568,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155221,11 +175580,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -155237,7 +175596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -155263,8 +175622,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155282,8 +175641,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -155291,22 +175650,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155320,64 +175679,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155385,12 +175740,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -155401,8 +175756,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -155427,8 +175782,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155446,31 +175801,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155484,15 +175839,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155500,37 +175855,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 16 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155538,9 +175889,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -155549,13 +175900,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155565,7 +175916,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -155591,8 +175942,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155610,31 +175961,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155648,7 +175999,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -155656,45 +176007,45 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 - LSPB: 64 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -155702,10 +176053,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155713,13 +176064,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155755,8 +176106,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155774,31 +176125,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 + SolutionIndex: 1114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155812,15 +176163,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -155828,48 +176179,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -155878,12 +176225,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -155893,8 +176240,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -155919,8 +176266,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -155938,31 +176285,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -155982,43 +176329,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156030,10 +176373,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156041,13 +176384,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156057,7 +176400,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -156083,8 +176426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156102,29 +176445,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 + SolutionIndex: 1116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156148,7 +176491,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -156156,48 +176499,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 64 - LVCA: 32 + LSPB: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 - LVPB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156205,13 +176548,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156247,8 +176590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156266,29 +176609,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 + SolutionIndex: 1117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156312,7 +176655,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -156320,33 +176663,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156358,10 +176701,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156369,13 +176712,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156411,8 +176754,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156430,29 +176773,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 + SolutionIndex: 1118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156476,41 +176819,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -156522,10 +176865,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156533,13 +176876,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156575,8 +176918,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156594,29 +176937,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 + SolutionIndex: 1119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156640,7 +176983,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -156648,48 +176991,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 64 - LVCA: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156697,13 +177040,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -156739,8 +177082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156758,29 +177101,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 + SolutionIndex: 1120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156805,7 +177148,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -156824,36 +177167,36 @@ LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12416 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -156861,12 +177204,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -156903,8 +177246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -156922,15 +177265,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -156943,8 +177286,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -156985,38 +177328,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157025,11 +177368,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 512 PackBatchDims: 0 @@ -157042,7 +177385,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -157067,8 +177410,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157086,14 +177429,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 + SolutionIndex: 1122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -157107,8 +177450,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -157133,7 +177476,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -157151,20 +177494,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -157179,9 +177522,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157189,13 +177532,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157231,8 +177574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157250,29 +177593,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 + SolutionIndex: 1123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -157297,7 +177640,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -157315,20 +177658,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -157343,9 +177686,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157353,13 +177696,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157395,8 +177738,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157414,20 +177757,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 + SolutionIndex: 1124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -157435,7 +177778,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -157479,16 +177822,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 32 LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 13440 LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 @@ -157507,9 +177850,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -157521,9 +177864,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -157559,8 +177902,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157578,15 +177921,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 + SolutionIndex: 1125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 32 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 32 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -157599,7 +177942,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] + WorkGroup: [32, 8, 2] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -157608,7 +177951,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157616,59 +177959,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 8 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157677,8 +178025,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157693,13 +178043,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157719,8 +178071,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157738,14 +178090,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -157758,17 +178110,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157776,59 +178126,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 8 + LVCA: 16 LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -157837,8 +178192,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -157853,13 +178208,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -157879,8 +178236,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -157898,14 +178255,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -157918,17 +178275,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -157936,7 +178293,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -157944,55 +178301,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158001,12 +178359,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158017,13 +178375,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158043,8 +178403,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158062,37 +178422,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158106,7 +178466,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -158124,35 +178484,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 96 LSCB: 8 - LSPA: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158161,11 +178526,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158177,13 +178542,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158203,8 +178570,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158222,20 +178589,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158243,16 +178610,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158266,7 +178633,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -158284,35 +178651,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 96 LSCB: 8 - LSPA: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158321,11 +178693,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158337,13 +178709,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158363,8 +178737,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158382,20 +178756,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158403,16 +178777,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158444,39 +178818,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 96 LSCB: 8 - LSPA: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 + LVPA: 3 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158485,11 +178860,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -158501,6 +178876,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -158508,6 +178884,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158527,8 +178904,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158546,20 +178923,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -158567,16 +178944,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158590,8 +178967,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -158608,36 +178985,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158645,12 +179027,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158661,13 +179045,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158687,8 +179073,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158706,37 +179092,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158744,7 +179128,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -158753,14 +179137,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -158768,29 +179152,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -158798,10 +179183,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -158809,12 +179194,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -158825,6 +179210,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -158832,6 +179218,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -158851,8 +179238,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -158870,37 +179257,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -158908,59 +179295,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 8 + LVCA: 128 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -158969,11 +179361,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -158985,13 +179379,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159011,8 +179407,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159030,37 +179426,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159068,14 +179462,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -159092,25 +179486,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159118,9 +179517,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159129,12 +179528,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159145,13 +179546,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159171,8 +179574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159190,20 +179593,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159211,16 +179614,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159228,14 +179629,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -159252,29 +179653,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159282,9 +179684,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159293,12 +179695,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159309,6 +179713,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159316,6 +179721,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159335,8 +179741,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159354,20 +179760,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -159375,16 +179781,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159392,15 +179796,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -159408,37 +179812,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159446,9 +179851,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159457,12 +179862,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159473,13 +179880,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159499,8 +179908,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159518,37 +179927,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159556,15 +179963,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -159572,48 +179979,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -159621,12 +180029,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -159637,6 +180047,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159644,6 +180055,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159663,8 +180075,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159682,37 +180094,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159720,16 +180130,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -159744,29 +180154,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -159774,9 +180185,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159785,8 +180196,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -159801,6 +180214,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159808,6 +180222,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159827,8 +180242,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -159846,16 +180261,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -159867,16 +180282,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -159891,7 +180304,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -159908,39 +180321,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -159949,11 +180363,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -159965,6 +180381,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -159972,6 +180389,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -159991,8 +180409,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160010,16 +180428,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -160031,16 +180449,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160048,7 +180464,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160056,45 +180472,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -160102,9 +180519,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -160113,11 +180530,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -160129,13 +180546,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160155,8 +180574,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160174,16 +180593,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -160194,17 +180613,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160212,7 +180631,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160220,55 +180639,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -160277,12 +180697,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -160293,6 +180713,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -160300,6 +180721,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160319,8 +180741,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160338,37 +180760,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160376,7 +180798,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160385,7 +180807,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -160400,39 +180822,40 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -160441,12 +180864,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -160457,13 +180880,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160483,8 +180908,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160502,20 +180927,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -160523,16 +180948,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160540,7 +180965,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160549,7 +180974,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -160564,19 +180989,20 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -160586,18 +181012,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160605,8 +181031,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160621,13 +181047,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160647,8 +181075,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160666,20 +181094,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -160687,16 +181115,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160704,7 +181132,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -160712,7 +181140,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160720,48 +181148,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160769,8 +181198,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160785,6 +181214,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -160792,6 +181222,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160811,8 +181242,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160830,37 +181261,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -160868,15 +181299,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -160884,44 +181315,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -160929,8 +181365,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -160945,13 +181381,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -160971,8 +181409,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -160990,37 +181428,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161036,56 +181474,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161093,11 +181532,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -161109,6 +181548,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161116,6 +181556,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161135,8 +181576,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161154,29 +181595,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -161184,7 +181625,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161192,53 +181633,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161246,10 +181688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161257,12 +181699,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -161273,6 +181717,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161280,6 +181725,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161299,8 +181745,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161318,37 +181764,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161356,14 +181800,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -161380,29 +181824,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161410,10 +181855,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161421,8 +181866,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -161437,13 +181884,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161463,8 +181912,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161482,37 +181931,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161520,14 +181967,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -161544,29 +181991,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161574,10 +182022,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161585,8 +182033,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -161601,6 +182051,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -161608,6 +182059,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161627,8 +182079,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161646,37 +182098,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161684,16 +182134,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -161708,29 +182158,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161738,10 +182189,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161749,8 +182200,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -161765,13 +182218,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161791,8 +182246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161810,37 +182265,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -161848,16 +182301,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -161872,29 +182325,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -161902,10 +182356,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -161913,8 +182367,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -161929,13 +182385,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -161955,8 +182413,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -161974,37 +182432,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162012,16 +182468,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -162036,29 +182492,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162066,10 +182523,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162077,8 +182534,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -162093,6 +182552,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162100,6 +182560,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162119,8 +182580,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162138,37 +182599,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162176,14 +182635,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -162200,29 +182659,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162230,10 +182690,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162241,8 +182701,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -162257,6 +182719,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162264,6 +182727,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162283,8 +182747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162302,20 +182766,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 1154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -162323,16 +182787,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162340,14 +182802,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -162364,29 +182826,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162394,10 +182857,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162405,12 +182868,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -162421,6 +182886,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162428,6 +182894,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162447,8 +182914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162466,37 +182933,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162504,7 +182969,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162513,7 +182978,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -162528,29 +182993,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCB: 8 + LSPA: 8 + LSPB: 128 LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162559,9 +183025,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162569,13 +183035,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -162585,13 +183051,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162611,8 +183079,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162630,20 +183098,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + SolutionIndex: 1156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -162651,16 +183119,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162668,7 +183136,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -162692,29 +183160,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -162722,10 +183191,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -162733,8 +183202,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -162749,6 +183218,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -162756,6 +183226,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162775,8 +183246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162794,37 +183265,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 + SolutionIndex: 1157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -162832,63 +183303,60 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 32 + LSCB: 8 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -162897,11 +183365,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -162913,13 +183383,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -162939,8 +183411,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -162958,37 +183430,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163002,50 +183472,55 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -163058,11 +183533,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163073,13 +183550,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163099,8 +183578,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163118,37 +183597,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163164,52 +183641,53 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -163222,11 +183700,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163237,6 +183715,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163244,6 +183723,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163263,8 +183743,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163282,29 +183762,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -163312,7 +183792,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163328,56 +183808,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -163385,8 +183866,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -163401,6 +183882,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163408,6 +183890,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163427,8 +183910,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163446,28 +183929,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -163476,7 +183959,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163491,42 +183974,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -163538,9 +184022,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -163549,12 +184033,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163565,6 +184051,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163572,6 +184059,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163591,8 +184079,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163610,8 +184098,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -163619,28 +184107,26 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163648,16 +184134,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -163668,33 +184154,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -163714,7 +184201,9 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -163729,6 +184218,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -163736,6 +184226,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163755,8 +184246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163774,37 +184265,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1040 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163812,15 +184301,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -163828,29 +184317,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -163861,7 +184355,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -163874,11 +184368,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -163889,13 +184385,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -163915,8 +184413,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -163934,8 +184432,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -163954,17 +184452,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -163978,39 +184474,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -164034,11 +184535,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -164049,13 +184552,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164075,8 +184580,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164094,8 +184599,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -164114,17 +184619,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164149,46 +184652,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164197,11 +184701,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -164213,6 +184717,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -164220,6 +184725,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164239,8 +184745,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164258,14 +184764,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -164279,7 +184785,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -164288,7 +184794,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164296,7 +184802,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -164304,7 +184810,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -164312,37 +184818,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164350,9 +184857,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164361,12 +184868,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -164377,6 +184884,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -164384,6 +184892,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164403,8 +184912,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164422,14 +184931,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -164442,17 +184951,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164460,45 +184969,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -164509,10 +185023,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164521,11 +185035,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -164537,13 +185051,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164563,8 +185079,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164582,37 +185098,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164620,49 +185136,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -164670,9 +185191,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164681,8 +185202,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -164697,13 +185220,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164723,8 +185248,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164742,14 +185267,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -164758,21 +185283,19 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164780,59 +185303,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -164841,12 +185369,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -164857,13 +185387,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -164883,8 +185415,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -164902,37 +185434,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -164940,59 +185470,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -165001,12 +185536,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165017,13 +185554,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165043,8 +185582,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165062,37 +185601,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165100,13 +185637,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -165124,25 +185661,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 528 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165150,9 +185692,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -165161,12 +185703,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165177,13 +185719,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165203,8 +185747,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165222,20 +185766,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -165243,16 +185787,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165260,23 +185804,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -165284,25 +185828,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCB: 32 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165337,13 +185886,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165363,8 +185914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165382,20 +185933,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -165403,16 +185954,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165420,23 +185971,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -165444,25 +185995,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 + LSCB: 32 + LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165471,9 +186027,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165481,12 +186037,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165497,13 +186055,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165523,8 +186083,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165542,37 +186102,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165580,16 +186138,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -165600,33 +186158,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 32 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -165635,9 +186194,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165645,11 +186204,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -165661,6 +186222,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -165668,6 +186230,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165687,8 +186250,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165706,20 +186269,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -165727,16 +186290,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165744,60 +186305,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -165805,11 +186371,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -165821,13 +186389,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -165847,8 +186417,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -165866,15 +186436,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -165882,21 +186452,19 @@ ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -165910,37 +186478,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -165954,9 +186527,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -165965,12 +186538,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -165981,13 +186556,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166007,8 +186584,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166026,37 +186603,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166071,7 +186646,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -166084,27 +186659,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -166118,9 +186694,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166129,11 +186705,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -166145,13 +186723,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166171,8 +186751,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166190,37 +186770,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166228,49 +186806,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -166281,10 +186860,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -166293,11 +186872,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -166309,6 +186890,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166316,6 +186898,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166335,8 +186918,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166354,37 +186937,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166392,53 +186973,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166446,10 +187024,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166457,13 +187035,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166473,13 +187054,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166499,8 +187082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166518,37 +187101,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166556,53 +187137,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166610,10 +187188,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166621,13 +187199,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166637,13 +187218,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166663,8 +187246,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166682,37 +187265,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166720,16 +187301,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -166744,40 +187325,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166785,13 +187367,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166801,6 +187386,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166808,6 +187394,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166827,8 +187414,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -166846,16 +187433,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -166867,16 +187454,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -166884,16 +187469,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -166908,29 +187493,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -166938,10 +187524,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -166949,13 +187535,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -166965,6 +187554,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -166972,6 +187562,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -166991,8 +187582,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167010,16 +187601,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -167031,16 +187622,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167048,7 +187637,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -167056,7 +187645,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -167064,37 +187653,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167103,9 +187693,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167113,13 +187703,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167129,6 +187720,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167136,6 +187728,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167155,8 +187748,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167174,16 +187767,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -167194,17 +187787,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167212,7 +187805,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -167221,7 +187814,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -167236,40 +187829,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167277,13 +187871,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167293,6 +187888,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167300,6 +187896,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167319,8 +187916,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167338,16 +187935,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -167359,16 +187956,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -167376,15 +187973,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -167392,37 +187989,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 4 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -167430,10 +188028,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -167441,13 +188039,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167457,6 +188058,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -167464,6 +188066,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -167483,8 +188086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167502,33 +188105,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 + SolutionIndex: 1186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167540,48 +188141,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -167594,10 +188195,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167606,15 +188207,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167652,8 +188252,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167671,8 +188271,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -167681,21 +188281,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167707,44 +188309,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 16 + LVCA: 64 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -167761,10 +188363,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167773,13 +188375,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167817,8 +188422,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -167836,8 +188441,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -167846,23 +188451,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -167881,8 +188484,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -167890,32 +188493,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -167929,9 +188532,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -167940,13 +188543,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -167958,7 +188564,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -167984,8 +188590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168003,8 +188609,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168013,23 +188619,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168048,8 +188652,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168057,32 +188661,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168096,9 +188700,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168107,13 +188711,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168125,7 +188732,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -168151,8 +188758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168170,8 +188777,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168180,23 +188787,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168208,48 +188813,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 - LVPB: 32 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168262,10 +188867,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168274,13 +188879,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168318,8 +188926,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168337,8 +188945,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168347,23 +188955,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168383,7 +188989,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168391,32 +188997,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 8 - LSPA: 5 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168430,9 +189036,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -168441,13 +189047,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168485,8 +189092,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168504,8 +189111,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168514,22 +189121,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -168542,15 +189149,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -168558,32 +189165,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168596,11 +189203,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -168608,15 +189215,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168654,8 +189260,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168673,8 +189279,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168683,21 +189289,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168715,42 +189323,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 96 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -168764,10 +189368,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -168775,13 +189379,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168792,8 +189399,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -168819,8 +189426,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -168838,8 +189445,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -168848,13 +189455,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -168863,8 +189470,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -168882,7 +189487,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -168902,24 +189507,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 2 + LSPA: 4 LSPB: 32 - LVCA: 128 + LVCA: 64 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 784 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -168931,10 +189532,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -168942,15 +189543,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -168961,8 +189563,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -168988,8 +189590,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169007,8 +189609,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169017,11 +189619,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -169029,7 +189631,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -169043,23 +189645,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -169069,24 +189671,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 64 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169097,10 +189699,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169109,15 +189711,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169129,7 +189730,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -169155,8 +189756,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169174,8 +189775,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169184,10 +189785,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169198,7 +189799,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169210,50 +189813,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 64 + LSCB: 16 LSPA: 4 - LSPB: 64 + LSPB: 16 LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169264,10 +189863,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169276,15 +189875,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169295,7 +189895,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169322,8 +189922,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169341,8 +189941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169351,21 +189951,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169377,50 +189977,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169431,10 +190027,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169443,15 +190039,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169462,8 +190059,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -169489,8 +190086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169508,8 +190105,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169518,21 +190115,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169544,16 +190141,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -169570,24 +190167,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169598,10 +190191,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169610,15 +190203,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169629,7 +190223,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169656,8 +190250,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169675,8 +190269,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169685,10 +190279,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169697,9 +190291,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169711,16 +190305,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -169737,24 +190331,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -169765,10 +190355,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -169777,15 +190367,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169796,7 +190385,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169823,8 +190412,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -169842,8 +190431,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -169852,10 +190441,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -169864,9 +190453,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -169884,59 +190475,55 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 16 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -169944,8 +190531,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -169953,6 +190540,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -169963,7 +190551,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -169990,8 +190578,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170009,29 +190597,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -170045,15 +190633,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -170061,38 +190649,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170100,10 +190688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170111,13 +190699,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170129,7 +190720,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -170155,8 +190746,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170174,33 +190765,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170212,7 +190801,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -170220,7 +190809,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -170228,38 +190817,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170267,10 +190856,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170278,13 +190867,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170322,8 +190912,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170341,31 +190931,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -170379,16 +190969,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -170399,34 +190989,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170434,10 +191024,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170445,13 +191035,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170463,7 +191056,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -170489,8 +191082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170508,33 +191101,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170546,54 +191137,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 32 + LSCB: 32 LSPA: 8 - LSPB: 64 + LSPB: 8 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3392 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170601,10 +191192,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170612,13 +191203,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170630,7 +191224,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -170656,8 +191250,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170675,33 +191269,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -170713,7 +191305,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -170721,7 +191313,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -170729,38 +191321,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170768,10 +191360,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170779,13 +191371,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170823,8 +191416,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -170842,31 +191435,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -170880,14 +191473,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -170906,28 +191499,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -170935,10 +191528,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -170946,13 +191539,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -170990,8 +191586,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171009,20 +191605,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -171030,16 +191626,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171047,48 +191641,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171101,10 +191691,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -171113,13 +191703,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171130,8 +191723,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -171157,8 +191750,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171176,8 +191769,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171186,23 +191779,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -171221,41 +191812,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 - LVPA: 2 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171269,10 +191860,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171280,15 +191871,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171326,8 +191916,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171345,8 +191935,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171355,25 +191945,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171389,36 +191981,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -171437,9 +192029,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171447,15 +192039,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171467,7 +192060,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -171493,8 +192086,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171512,8 +192105,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171522,17 +192115,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -171583,9 +192176,9 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -171604,9 +192197,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171614,15 +192207,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171660,8 +192254,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171679,8 +192273,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171689,11 +192283,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -171701,13 +192295,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171721,42 +192315,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -171770,10 +192360,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171781,8 +192371,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -171790,6 +192380,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171800,7 +192391,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -171827,8 +192418,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -171846,8 +192437,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -171856,25 +192447,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -171890,42 +192481,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -171937,10 +192528,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -171948,8 +192539,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -171957,6 +192548,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -171994,8 +192586,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172013,8 +192605,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172023,25 +192615,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172055,44 +192647,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 520 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -172104,10 +192692,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172115,8 +192703,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -172124,6 +192712,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172134,8 +192723,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -172161,8 +192750,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172180,8 +192769,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172190,25 +192779,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172216,50 +192805,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 + LSCA: 32 + LSCB: 16 LSPA: 8 - LSPB: 128 + LSPB: 16 LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -172270,11 +192855,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172282,15 +192867,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172301,8 +192887,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -172328,8 +192914,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172347,8 +192933,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172357,25 +192943,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172383,7 +192969,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172391,40 +192977,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -172437,11 +193023,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172449,15 +193035,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172469,7 +193056,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -172495,8 +193082,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172514,8 +193101,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -172524,21 +193111,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -172557,58 +193144,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172616,13 +193203,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172634,7 +193224,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -172660,8 +193250,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172679,37 +193269,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172717,65 +193305,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -172783,13 +193367,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172800,7 +193387,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -172827,8 +193414,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -172846,37 +193433,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -172884,7 +193469,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -172893,33 +193478,33 @@ ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 2048 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 @@ -172927,17 +193512,17 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -172946,7 +193531,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 @@ -172955,6 +193540,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -172966,7 +193552,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -172992,8 +193578,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -173011,31 +193597,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 + SolutionIndex: 1219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173054,8 +193640,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -173063,49 +193649,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -173114,14 +193700,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173159,8 +193744,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: true @@ -173178,31 +193763,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173214,54 +193801,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -173269,24 +193856,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173324,9 +193916,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -173343,33 +193936,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173381,54 +193972,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -173436,24 +194027,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173491,9 +194087,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -173510,33 +194107,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173556,66 +194151,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -173623,6 +194220,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173660,9 +194258,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -173679,29 +194278,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 + SolutionIndex: 1223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -173722,9 +194321,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -173735,61 +194334,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173827,9 +194427,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -173846,31 +194447,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -173889,9 +194492,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -173911,52 +194514,53 @@ LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -173994,9 +194598,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174013,15 +194618,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -174034,10 +194639,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174049,15 +194656,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -174069,61 +194676,62 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 8 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174135,7 +194743,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -174161,9 +194769,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174180,31 +194789,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174216,7 +194827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -174225,14 +194836,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -174243,52 +194854,55 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 8 LSPA: 8 - LSPB: 16 + LSPB: 96 LVCA: 32 - LVCB: 16 + LVCB: 2 LVPA: 4 - LVPB: 8 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174326,9 +194940,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174345,31 +194960,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -174383,7 +194998,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -174391,8 +195006,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -174403,34 +195018,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -174439,23 +195054,26 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174493,9 +195111,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174512,31 +195131,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -174550,16 +195169,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -174576,53 +195195,58 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174634,7 +195258,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -174660,9 +195284,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174679,20 +195304,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -174700,12 +195325,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174717,7 +195340,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -174725,8 +195348,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -174737,34 +195360,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -174772,19 +195395,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -174792,6 +195417,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174803,7 +195429,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -174829,9 +195455,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -174848,31 +195475,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -174884,7 +195511,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -174892,46 +195519,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -174939,26 +195566,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -174996,9 +195626,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175015,31 +195646,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175051,7 +195682,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -175059,46 +195690,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175106,26 +195737,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175163,9 +195797,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175182,31 +195817,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 + SolutionIndex: 1232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175218,79 +195853,84 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175328,9 +195968,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175347,33 +195988,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175385,7 +196024,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -175394,14 +196033,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -175411,28 +196050,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175440,24 +196079,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175495,9 +196137,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175514,31 +196157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -175552,16 +196195,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -175572,34 +196215,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175607,26 +196250,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175664,9 +196308,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175683,31 +196328,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175719,54 +196366,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175774,26 +196421,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175805,7 +196453,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -175831,9 +196479,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -175850,31 +196499,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -175886,54 +196537,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -175941,26 +196592,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -175972,7 +196624,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -175998,9 +196650,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -176017,31 +196670,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -176053,54 +196708,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -176108,26 +196763,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176165,9 +196821,183 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -176184,8 +197014,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -176194,21 +197024,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -176228,7 +197058,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -176236,34 +197066,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -176275,18 +197105,20 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 @@ -176295,6 +197127,7 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176306,7 +197139,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -176332,9 +197165,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -176351,8 +197185,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -176361,17 +197195,17 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -176387,16 +197221,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -176414,21 +197248,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -176441,7 +197275,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -176449,19 +197283,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -176499,9 +197334,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -176518,8 +197354,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -176542,7 +197378,9 @@ WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [704, 1024, 1, 128] - [102, 3019.46] @@ -179022,8 +199860,6 @@ - [212, 8995.84] - - [4096, 512, 1, 2048] - [207, 9298.08] - - - [512, 256, 1, 2048] - - [200, 5186.16] - - [4096, 1024, 1, 2048] - [189, 9790.67] - - [2048, 1024, 1, 2048] @@ -182512,5972 +203348,6502 @@ - [524, 10427.3] - - [1024, 1, 1, 13] - [537, 0.0] + - - [768, 512, 1, 768] + - [561, 5889.04] + - - [768, 2048, 1, 3072] + - [571, 9394.62] + - - [768, 32, 1, 768] + - [583, 1502.74] + - - [64, 128, 96, 128] + - [578, 4973.48] + - - [3072, 1024, 1, 768] + - [572, 9856.07] + - - [768, 1024, 1, 3072] + - [565, 8611.06] + - - [768, 512, 1, 3072] + - [564, 6430.79] + - - [768, 64, 1, 768] + - [585, 2621.44] + - - [768, 4096, 1, 3072] + - [570, 10030.4] + - - [768, 2048, 1, 2] + - [563, 381.763] + - - [768, 2048, 1, 768] + - [568, 9754.2] + - - [768, 320, 1, 30522] + - [581, 8529.4] + - - [64, 64, 96, 64] + - [575, 2496.61] + - - [768, 640, 1, 30522] + - [562, 8253.84] + - - [768, 1280, 1, 30522] + - [567, 9572.85] + - - [768, 1280, 1, 768] + - [571, 8713.93] + - - [768, 640, 1, 768] + - [561, 7293.03] + - - [768, 32, 1, 2] + - [573, 11.8154] + - - [3072, 2048, 1, 768] + - [568, 10019.6] + - - [768, 4096, 1, 768] + - [568, 9927.35] + - - [3072, 4096, 1, 768] + - [571, 10150.1] + - - [64, 256, 192, 256] + - [577, 7054.19] + - - [768, 8, 1, 768] + - [584, 340.939] + - - [64, 128, 384, 128] + - [576, 6765.01] + - - [768, 1024, 1, 768] + - [566, 8768.58] + - - [768, 320, 1, 768] + - [582, 6838.54] + - - [64, 64, 768, 64] + - [579, 5388.83] + - - [768, 1024, 1, 2] + - [559, 258.695] + - - [768, 16, 1, 768] + - [584, 819.2] + - - [64, 256, 96, 256] + - [577, 5893.64] + - - [3072, 512, 1, 768] + - [569, 9722.79] + - - [768, 160, 1, 768] + - [586, 5019.78] + - - [768, 4096, 1, 2] + - [560, 507.375] + - - [1600, 512, 1, 1024] + - [590, 7186.95] + - - [1024, 512, 1, 64] + - [588, 2557.5] + - - [1024, 512, 1, 1] + - [587, 71.2348] + - - [2048, 512, 1, 1] + - [589, 90.3945] + - - [1024, 200, 1, 1] + - [595, 40.0] + - - [32, 200, 1, 1] + - [591, 1.56863] + - - [560, 200, 1, 1024] + - [599, 4731.35] + - - [1, 512, 1, 1] + - [598, 0.130612] + - - [64, 512, 1, 1] + - [593, 7.58519] + - - [1024, 8192, 1, 256] + - [608, 9518.99] + - - [1024, 22016, 1, 256] + - [614, 9881.12] + - - [256, 8976, 1, 4352] + - [606, 9567.08] + - - [512, 256, 1, 2048] + - [619, 5917.89] + - - [1024, 19968, 1, 256] + - [614, 9882.37] + - - [256, 8976, 1, 1536] + - [604, 8437.35] + - - [256, 8976, 1, 33536] + - [604, 8441.89] + - - [1024, 1792, 1, 256] + - [604, 7756.97] + - - [1024, 21504, 1, 256] + - [614, 9893.9] + - - [512, 215, 1, 2048] + - [620, 4665.64] + - - [1024, 7168, 1, 256] + - [608, 9509.35] + - - [256, 8976, 1, 15872] + - [610, 8914.65] + - - [1024, 19712, 1, 256] + - [614, 9771.9] + - - [256, 8976, 1, 5632] + - [610, 8740.03] + - - [1024, 14848, 1, 256] + - [614, 9756.15] + - - [1024, 28672, 1, 256] + - [614, 9958.92] + - - [256, 8976, 1, 9728] + - [617, 8853.04] + - - [1024, 17152, 1, 256] + - [608, 9737.3] + - - [256, 8976, 1, 11520] + - [610, 8999.2] + - - [256, 8976, 1, 8192] + - [600, 7897.32] + - - [1024, 3328, 1, 256] + - [615, 8593.53] + - - [256, 8976, 1, 7424] + - [610, 8980.47] + - - [1024, 18944, 1, 256] + - [614, 9854.85] + - - [1024, 10496, 1, 256] + - [609, 9453.9] + - - [256, 8976, 1, 5376] + - [607, 9608.37] + - - [256, 8976, 1, 6144] + - [604, 7880.13] + - - [1024, 40448, 1, 256] + - [614, 10016.6] + - - [256, 8976, 1, 22016] + - [617, 8939.87] + - - [256, 8976, 1, 4864] + - [605, 9211.43] + - - [256, 8976, 1, 12288] + - [601, 8065.05] + - - [1024, 9728, 1, 256] + - [614, 9636.25] + - - [256, 8976, 1, 2048] + - [602, 7001.33] + - - [1024, 10240, 1, 256] + - [608, 9619.96] + - - [256, 8976, 1, 2304] + - [606, 9509.74] + - - [1024, 7936, 1, 256] + - [614, 9300.67] + - - [768, 256, 1, 2048] + - [618, 6267.95] + - - [1024, 9984, 1, 256] + - [614, 9477.28] + - - [1024, 13312, 1, 256] + - [614, 9758.56] + - - [1024, 16128, 1, 256] + - [608, 9721.9] + - - [1024, 8960, 1, 256] + - [609, 9398.25] + - - [1024, 5120, 1, 256] + - [615, 9315.5] + - - [1024, 11264, 1, 256] + - [608, 9664.8] + - - [256, 8976, 1, 20480] + - [616, 8279.87] + - - [1024, 20992, 1, 256] + - [608, 9878.87] + - - [256, 8976, 1, 9472] + - [610, 8990.96] + - - [256, 8976, 1, 8448] + - [610, 8983.52] + - - [256, 8976, 1, 20992] + - [611, 8942.11] + - - [256, 8976, 1, 10496] + - [611, 8989.71] + - - [1024, 15104, 1, 256] + - [609, 9676.01] + - - [1024, 6400, 1, 256] + - [617, 9145.89] + - - [1024, 4096, 1, 256] + - [610, 9124.25] + - - [256, 8976, 1, 2560] + - [604, 8566.11] + - - [256, 8976, 1, 2816] + - [606, 9496.84] + - - [1024, 7680, 1, 256] + - [614, 9460.84] + - - [256, 8976, 1, 14336] + - [611, 8226.8] + - - [256, 8976, 1, 6656] + - [611, 8771.42] + - - [1024, 3072, 1, 256] + - [611, 9076.94] + - - [256, 8976, 1, 5888] + - [607, 9546.3] + - - [1024, 12288, 1, 256] + - [608, 9690.81] + - - [256, 8976, 1, 26112] + - [613, 8699.83] + - - [1024, 7424, 1, 256] + - [615, 9256.84] + - - [256, 8976, 1, 14848] + - [616, 8885.79] + - - [768, 215, 1, 2048] + - [618, 5628.59] + - - [1024, 2560, 1, 256] + - [611, 8820.83] + - - [256, 8976, 1, 19968] + - [610, 8928.86] + - - [256, 8976, 1, 9984] + - [610, 8993.12] + - - [1024, 4864, 1, 256] + - [611, 8974.3] + - - [1024, 33536, 1, 256] + - [614, 9943.07] + - - [256, 8976, 1, 15104] + - [611, 8996.63] + - - [1024, 2048, 1, 256] + - [609, 8462.66] + - - [256, 8976, 1, 8960] + - [611, 8998.92] + - - [1024, 6144, 1, 256] + - [616, 9359.67] + - - [1024, 14592, 1, 256] + - [614, 9667.42] + - - [256, 8976, 1, 19712] + - [610, 9020.11] + - - [1024, 11520, 1, 256] + - [609, 9527.7] + - - [1024, 5632, 1, 256] + - [608, 9297.2] + - - [256, 8976, 1, 11008] + - [617, 8994.8] + - - [256, 8976, 1, 17152] + - [611, 9003.8] + - - [256, 8976, 1, 3072] + - [600, 8261.96] + - - [1024, 3840, 1, 256] + - [617, 8671.89] + - - [1024, 14336, 1, 256] + - [614, 9760.28] + - - [1024, 20480, 1, 256] + - [608, 9887.85] + - - [1024, 23552, 1, 256] + - [608, 9890.46] + - - [256, 8976, 1, 7168] + - [603, 8478.34] + - - [1024, 13568, 1, 256] + - [608, 9654.64] + - - [1024, 4608, 1, 256] + - [616, 9218.25] + - - [256, 8976, 1, 10240] + - [601, 8076.16] + - - [1024, 8704, 1, 256] + - [610, 9475.5] + - - [1024, 11008, 1, 256] + - [614, 9524.96] + - - [1024, 8448, 1, 256] + - [608, 9352.16] + - - [256, 8976, 1, 44505] + - [612, 8430.23] - - [704, 1024, 1, 128] - - [661, 3019.56] + - [723, 3019.56] - - [1024, 1024, 1, 3328] - - [699, 8162.65] + - [761, 8162.65] - - [4, 704, 1, 1280] - - [602, 319.646] + - [664, 319.646] - - [4, 1856, 1, 3328] - - [632, 550.614] + - [694, 550.614] - - [1856, 448, 1, 3328] - - [684, 6813.15] + - [746, 6813.15] - - [2944, 4288, 1, 1280] - - [693, 8975.86] + - [755, 8975.86] - - [2368, 64, 1, 3328] - - [607, 5482.33] + - [669, 5482.33] - - [1760, 32, 1, 1760] - - [646, 3860.04] + - [708, 3860.04] - - [2368, 5888, 1, 256] - - [690, 8656.83] + - [752, 8656.83] - - [5888, 1856, 1, 256] - - [680, 7881.53] + - [742, 7881.53] - - [64, 3584, 1, 1280] - - [616, 4835.43] + - [678, 4835.43] - - [512, 24000, 1, 1536] - - [687, 8666.0] + - [749, 8666.0] - - [128, 6784, 1, 3328] - - [684, 7062.35] + - [746, 7062.35] - - [5888, 1408, 1, 256] - - [697, 8130.32] + - [759, 8130.32] - - [5888, 1856, 1, 3328] - - [687, 8840.85] + - [749, 8840.85] - - [512, 4, 1, 512] - - [572, 170.323] + - [634, 170.323] - - [35, 1500, 1, 2560] - - [576, 2896.65] + - [638, 2896.65] - - [1856, 4288, 1, 256] - - [676, 8374.73] + - [738, 8374.73] - - [1024, 5056, 1, 128] - - [673, 3304.35] + - [735, 3304.35] - - [5056, 5056, 1, 3328] - - [687, 8905.53] + - [749, 8905.53] - - [1408, 5888, 1, 1280] - - [687, 9418.2] + - [749, 9418.2] - - [2368, 448, 1, 128] - - [661, 3075.07] + - [723, 3075.07] - - [6144, 6000, 1, 2560] - - [687, 9336.43] + - [749, 9336.43] - - [2368, 6784, 1, 128] - - [660, 4919.36] + - [722, 4919.36] - - [1024, 3584, 1, 3328] - - [678, 8071.17] + - [740, 8071.17] - - [512, 48000, 1, 2048] - - [687, 8763.16] + - [749, 8763.16] - - [1408, 64, 1, 128] - - [583, 805.57] + - [645, 805.57] - - [256, 4288, 1, 3328] - - [709, 6331.96] + - [771, 6331.96] - - [5888, 1408, 1, 1280] - - [677, 9226.27] + - [739, 9226.27] - - [704, 1856, 1, 3328] - - [703, 6309.5] + - [765, 6309.5] - - [1408, 4288, 1, 256] - - [687, 8374.6] + - [749, 8374.6] - - [1024, 2368, 1, 256] - - [684, 7341.12] + - [746, 7341.12] - - [64, 4, 1, 256] - - [627, 13.1032] + - [689, 13.1032] - - [1408, 1856, 1, 1280] - - [694, 8773.05] + - [756, 8773.05] - - [1408, 64, 1, 1280] - - [640, 4050.08] + - [702, 4050.08] - - [448, 1024, 1, 1280] - - [703, 6071.26] + - [765, 6071.26] - - [4096, 32, 1, 4096] - - [637, 5491.82] + - [699, 5491.82] - - [256, 1408, 1, 3328] - - [689, 5351.49] + - [751, 5351.49] - - [5056, 5056, 1, 1280] - - [697, 9408.67] + - [759, 9408.67] - - [448, 5056, 1, 256] - - [702, 6680.54] + - [764, 6680.54] - - [704, 1856, 1, 1280] - - [679, 7504.03] + - [741, 7504.03] - - [128, 5056, 1, 128] - - [594, 2316.58] + - [656, 2316.58] - - [2368, 128, 1, 256] - - [679, 3660.22] + - [741, 3660.22] - - [1856, 1408, 1, 128] - - [666, 3885.97] + - [728, 3885.97] - - [64, 5056, 1, 256] - - [689, 3318.91] + - [751, 3318.91] - - [6784, 256, 1, 3328] - - [687, 7590.64] + - [749, 7590.64] - - [1408, 3584, 1, 256] - - [676, 8276.92] + - [738, 8276.92] - - [4288, 448, 1, 256] - - [689, 7139.79] + - [751, 7139.79] - - [64, 704, 1, 128] - - [590, 375.567] + - [652, 375.567] - - [1024, 1856, 1, 128] - - [659, 2890.66] + - [721, 2890.66] - - [4288, 2944, 1, 1280] - - [693, 8981.45] + - [755, 8981.45] - - [704, 5056, 1, 1280] - - [679, 7684.72] + - [741, 7684.72] - - [2368, 704, 1, 3328] - - [694, 7070.14] + - [756, 7070.14] - - [256, 5888, 1, 256] - - [679, 7319.45] + - [741, 7319.45] - - [1856, 4288, 1, 3328] - - [677, 9238.69] + - [739, 9238.69] - - [256, 2944, 1, 256] - - [679, 6090.31] + - [741, 6090.31] - - [5888, 1024, 1, 256] - - [683, 8270.05] + - [745, 8270.05] - - [448, 64, 1, 1280] - - [636, 2493.32] + - [698, 2493.32] - - [3072, 64, 1, 1024] - - [619, 3149.77] + - [681, 3149.77] - - [3584, 4, 1, 1280] - - [721, 567.862] + - [783, 567.862] - - [2560, 16, 1, 2560] - - [628, 2887.15] + - [690, 2887.15] - - [2944, 64, 1, 256] - - [619, 2565.76] + - [681, 2565.76] - - [128, 4, 1, 1280] - - [722, 78.8692] + - [784, 78.8692] - - [1408, 2944, 1, 256] - - [683, 8337.3] + - [745, 8337.3] - - [256, 1856, 1, 1280] - - [709, 6267.35] + - [771, 6267.35] - - [6784, 5056, 1, 3328] - - [693, 9424.0] + - [755, 9424.0] - - [5056, 5056, 1, 256] - - [680, 8758.33] + - [742, 8758.33] - - [128, 256, 1, 256] - - [635, 1205.36] + - [697, 1205.36] - - [64, 1024, 1, 1280] - - [646, 3566.68] + - [708, 3566.68] - - [2944, 4, 1, 256] - - [599, 319.449] + - [661, 319.449] - - [704, 5056, 1, 128] - - [668, 4073.83] + - [730, 4073.83] - - [4, 2368, 1, 1280] - - [627, 496.992] + - [689, 496.992] - - [2368, 2944, 1, 1280] - - [676, 9085.55] + - [738, 9085.55] - - [448, 448, 1, 3328] - - [654, 5428.76] + - [716, 5428.76] - - [6784, 6784, 1, 1280] - - [693, 8727.03] + - [755, 8727.03] - - [1024, 256, 1, 3328] - - [703, 5499.42] + - [765, 5499.42] - - [1408, 4288, 1, 1280] - - [677, 9094.42] + - [739, 9094.42] - - [3584, 4288, 1, 1280] - - [680, 8703.88] + - [742, 8703.88] - - [512, 6000, 1, 2560] - - [683, 8474.56] + - [745, 8474.56] - - [2368, 704, 1, 1280] - - [689, 7651.59] + - [751, 7651.59] - - [5056, 4288, 1, 3328] - - [697, 8545.35] + - [759, 8545.35] - - [3584, 2368, 1, 3328] - - [685, 8797.88] + - [747, 8797.88] - - [5888, 6784, 1, 1280] - - [683, 8785.18] + - [745, 8785.18] - - [64, 704, 1, 1280] - - [606, 2783.48] + - [668, 2783.48] - - [4288, 256, 1, 256] - - [679, 6162.78] + - [741, 6162.78] - - [2944, 128, 1, 128] - - [581, 1951.33] + - [643, 1951.33] - - [6144, 32, 1, 2560] - - [640, 4589.05] + - [702, 4589.05] - - [6784, 448, 1, 1280] - - [684, 8674.31] + - [746, 8674.31] - - [2944, 5888, 1, 256] - - [697, 8991.76] + - [759, 8991.76] - - [64, 64, 1, 1280] - - [657, 712.448] + - [719, 712.448] - - [4288, 2944, 1, 256] - - [693, 8678.14] + - [755, 8678.14] - - [5888, 704, 1, 1280] - - [683, 8652.71] + - [745, 8652.71] - - [5056, 4, 1, 3328] - - [599, 650.772] + - [661, 650.772] - - [1856, 64, 1, 1280] - - [616, 4471.97] + - [678, 4471.97] - - [1760, 16, 1, 1760] - - [656, 2592.23] + - [718, 2592.23] - - [448, 5888, 1, 128] - - [666, 3823.03] + - [728, 3823.03] - - [5888, 64, 1, 3328] - - [648, 6013.22] + - [710, 6013.22] - - [2944, 256, 1, 3328] - - [689, 7791.45] + - [751, 7791.45] - - [1024, 64, 1, 128] - - [590, 592.516] + - [652, 592.516] - - [5056, 2368, 1, 1280] - - [676, 9260.53] + - [738, 9260.53] - - [448, 3584, 1, 1280] - - [697, 6771.34] + - [759, 6771.34] - - [6784, 5888, 1, 256] - - [691, 7933.39] + - [753, 7933.39] - - [64, 1024, 1, 3328] - - [640, 4783.08] + - [702, 4783.08] - - [704, 128, 1, 1280] - - [646, 3971.98] + - [708, 3971.98] - - [4, 3584, 1, 128] - - [715, 59.5238] + - [777, 59.5238] - - [1408, 448, 1, 1280] - - [689, 5902.17] + - [751, 5902.17] - - [1024, 1408, 1, 256] - - [684, 5272.94] + - [746, 5272.94] - - [2368, 2368, 1, 3328] - - [689, 8488.76] + - [751, 8488.76] - - [1856, 6784, 1, 128] - - [666, 4742.51] + - [728, 4742.51] - - [5056, 704, 1, 3328] - - [692, 7772.48] + - [754, 7772.48] - - [1408, 1856, 1, 256] - - [710, 5229.84] + - [772, 5229.84] - - [1408, 704, 1, 3328] - - [710, 6954.93] + - [772, 6954.93] - - [2368, 5056, 1, 256] - - [683, 8580.68] + - [745, 8580.68] - - [1408, 256, 1, 1280] - - [709, 4790.11] + - [771, 4790.11] - - [3072, 128, 1, 1024] - - [705, 4579.87] + - [767, 4579.87] - - [3584, 2368, 1, 1280] - - [676, 8675.13] + - [738, 8675.13] - - [4288, 64, 1, 3328] - - [655, 5550.11] + - [717, 5550.11] - - [2368, 4, 1, 1280] - - [721, 537.518] + - [783, 537.518] - - [704, 5888, 1, 256] - - [677, 5305.88] + - [739, 5305.88] - - [6784, 2944, 1, 128] - - [673, 4344.21] + - [735, 4344.21] - - [6784, 64, 1, 256] - - [703, 4496.42] + - [765, 4496.42] - - [2944, 256, 1, 256] - - [689, 6553.7] + - [751, 6553.7] - - [2944, 6784, 1, 3328] - - [677, 8895.76] + - [739, 8895.76] - - [128, 1, 1, 1408] - - [657, 25.7] + - [719, 25.7] - - [704, 1408, 1, 3328] - - [691, 7913.21] + - [753, 7913.21] - - [3584, 704, 1, 3328] - - [676, 7526.43] + - [738, 7526.43] - - [2944, 256, 1, 128] - - [660, 2830.76] + - [722, 2830.76] - - [6784, 4, 1, 1280] - - [717, 645.235] + - [779, 645.235] - - [1024, 64, 1, 1280] - - [615, 3013.25] + - [677, 3013.25] - - [8448, 4, 1, 2816] - - [567, 984.768] + - [629, 984.768] - - [448, 4288, 1, 256] - - [689, 7139.79] + - [751, 7139.79] - - [64, 3584, 1, 3328] - - [613, 5683.27] + - [675, 5683.27] - - [704, 2368, 1, 1280] - - [697, 7045.3] + - [759, 7045.3] - - [1856, 2368, 1, 1280] - - [694, 8327.9] + - [756, 8327.9] - - [2368, 128, 1, 3328] - - [630, 6082.65] + - [692, 6082.65] - - [64, 193600, 1, 64] - - [679, 6747.77] + - [741, 6747.77] - - [1760, 128, 1, 1760] - - [607, 5513.07] + - [669, 5513.07] - - [448, 1408, 1, 256] - - [689, 5591.54] + - [751, 5591.54] - - [1856, 4288, 1, 1280] - - [687, 8647.72] + - [749, 8647.72] - - [64, 5056, 1, 3328] - - [647, 6096.59] + - [709, 6096.59] - - [512, 1500, 1, 2816] - - [689, 7879.3] + - [751, 7879.3] - - [1024, 448, 1, 128] - - [661, 1844.33] + - [723, 1844.33] - - [704, 4, 1, 1280] - - [627, 341.433] + - [689, 341.433] - - [704, 256, 1, 128] - - [661, 1001.34] + - [723, 1001.34] - - [256, 193600, 1, 64] - - [697, 8113.3] + - [759, 8113.3] - - [704, 2944, 1, 128] - - [668, 3747.13] + - [730, 3747.13] - - [1408, 1024, 1, 1280] - - [694, 7080.71] + - [756, 7080.71] - - [704, 6784, 1, 256] - - [712, 6630.47] + - [774, 6630.47] - - [6784, 704, 1, 256] - - [679, 8005.86] + - [741, 8005.86] - - [5056, 1408, 1, 128] - - [670, 4303.13] + - [732, 4303.13] - - [2048, 7000, 1, 2048] - - [687, 9269.2] + - [749, 9269.2] - - [256, 3584, 1, 3328] - - [681, 7334.48] + - [743, 7334.48] - - [5056, 704, 1, 256] - - [689, 7954.12] + - [751, 7954.12] - - [128, 1408, 1, 128] - - [584, 1243.02] + - [646, 1243.02] - - [3584, 4288, 1, 3328] - - [713, 7683.81] + - [775, 7683.81] - - [5888, 1856, 1, 1280] - - [677, 8831.34] + - [739, 8831.34] - - [256, 1408, 1, 256] - - [679, 4352.68] + - [741, 4352.68] - - [5056, 64, 1, 1280] - - [646, 5012.05] + - [708, 5012.05] - - [1024, 704, 1, 256] - - [679, 5710.17] + - [741, 5710.17] - - [64, 256, 1, 128] - - [585, 149.897] + - [647, 149.897] - - [2368, 3584, 1, 1280] - - [687, 8609.68] + - [749, 8609.68] - - [1024, 256, 1, 256] - - [703, 3276.9] + - [765, 3276.9] - - [1856, 4, 1, 1280] - - [601, 497.104] + - [663, 497.104] - - [448, 448, 1, 256] - - [689, 3117.83] + - [751, 3117.83] - - [2944, 3584, 1, 3328] - - [677, 8879.45] + - [739, 8879.45] - - [7680, 32, 1, 2560] - - [647, 5310.24] + - [709, 5310.24] - - [128, 4288, 1, 128] - - [587, 2116.2] + - [649, 2116.2] - - [256, 256, 1, 3328] - - [640, 4774.7] + - [702, 4774.7] - - [128, 1024, 1, 3328] - - [641, 5894.8] + - [703, 5894.8] - - [4, 1408, 1, 3328] - - [632, 552.674] + - [694, 552.674] - - [196, 256, 64, 1024] - - [730, 5218.34] + - [792, 5218.34] - - [6784, 2944, 1, 256] - - [695, 8271.18] + - [757, 8271.18] - - [64, 1856, 1, 1280] - - [646, 4167.96] + - [708, 4167.96] - - [64, 1024, 1, 128] - - [580, 589.188] + - [642, 589.188] - - [1024, 1500, 1, 2560] - - [684, 8407.88] + - [746, 8407.88] - - [1856, 2368, 1, 256] - - [679, 8092.15] + - [741, 8092.15] - - [3584, 256, 1, 128] - - [662, 2607.57] + - [724, 2607.57] - - [3584, 6784, 1, 3328] - - [696, 8558.83] + - [758, 8558.83] - - [256, 1024, 1, 256] - - [689, 3901.78] + - [751, 3901.78] - - [4, 6784, 1, 3328] - - [627, 662.575] + - [689, 662.575] - - [1024, 5888, 1, 3328] - - [687, 9161.76] + - [749, 9161.76] - - [1024, 128, 1, 1280] - - [644, 3942.12] + - [706, 3942.12] - - [3072, 32, 1, 1024] - - [621, 2840.49] + - [683, 2840.49] - - [6144, 24000, 1, 2560] - - [677, 7605.87] + - [739, 7605.87] - - [448, 1024, 1, 256] - - [679, 5062.19] + - [741, 5062.19] - - [5056, 4288, 1, 1280] - - [687, 9090.99] + - [749, 9090.99] - - [5888, 64, 1, 256] - - [689, 4449.78] + - [751, 4449.78] - - [1856, 256, 1, 1280] - - [703, 5834.46] + - [765, 5834.46] - - [64, 5888, 1, 3328] - - [641, 6152.44] + - [703, 6152.44] - - [2368, 2368, 1, 1280] - - [681, 8594.66] + - [743, 8594.66] - - [2944, 5888, 1, 128] - - [666, 4776.19] + - [728, 4776.19] - - [704, 5888, 1, 1280] - - [681, 8435.91] + - [743, 8435.91] - - [2368, 3584, 1, 128] - - [663, 4590.71] + - [725, 4590.71] - - [1856, 5056, 1, 128] - - [674, 4503.48] + - [736, 4503.48] - - [4608, 1, 1, 1536] - - [572, 226.955] + - [634, 226.955] - - [448, 256, 1, 3328] - - [616, 5415.56] + - [678, 5415.56] - - [2944, 6784, 1, 1280] - - [700, 8385.11] + - [762, 8385.11] - - [448, 1856, 1, 128] - - [670, 2618.96] + - [732, 2618.96] - - [128, 1024, 1, 128] - - [579, 940.527] + - [641, 940.527] - - [7680, 4, 1, 2560] - - [603, 985.104] + - [665, 985.104] - - [1024, 704, 1, 1280] - - [689, 7204.56] + - [751, 7204.56] - - [128, 5888, 1, 256] - - [679, 6313.52] + - [741, 6313.52] - - [1024, 5056, 1, 1280] - - [684, 8979.76] + - [746, 8979.76] - - [4288, 1024, 1, 256] - - [676, 7198.29] + - [738, 7198.29] - - [2944, 2368, 1, 128] - - [661, 4624.57] + - [723, 4624.57] - - [704, 704, 1, 3328] - - [702, 5870.71] + - [764, 5870.71] - - [704, 1408, 1, 1280] - - [691, 7680.32] + - [753, 7680.32] - - [5888, 448, 1, 1280] - - [679, 7718.66] + - [741, 7718.66] - - [3584, 256, 1, 3328] - - [684, 7523.88] + - [746, 7523.88] - - [704, 5888, 1, 3328] - - [689, 8196.99] + - [751, 8196.99] - - [704, 1856, 1, 128] - - [667, 3388.43] + - [729, 3388.43] - - [128, 3584, 1, 3328] - - [641, 6626.5] + - [703, 6626.5] - - [4, 4288, 1, 128] - - [714, 159.648] + - [776, 159.648] - - [128, 704, 1, 1280] - - [604, 4038.73] + - [666, 4038.73] - - [3584, 2944, 1, 256] - - [677, 7685.99] + - [739, 7685.99] - - [1856, 128, 1, 3328] - - [633, 6070.63] + - [695, 6070.63] - - [1856, 2368, 1, 3328] - - [694, 8460.62] + - [756, 8460.62] - - [512, 6000, 1, 2816] - - [697, 9019.55] + - [759, 9019.55] - - [2944, 448, 1, 128] - - [660, 3027.73] + - [722, 3027.73] - - [64, 193600, 1, 256] - - [703, 7080.32] + - [765, 7080.32] - - [128, 2944, 1, 1280] - - [679, 5397.87] + - [741, 5397.87] - - [448, 2944, 1, 1280] - - [689, 6996.97] + - [751, 6996.97] - - [512, 24000, 1, 2048] - - [697, 8832.67] + - [759, 8832.67] - - [128, 256, 1, 3328] - - [636, 3531.57] + - [698, 3531.57] - - [1408, 5056, 1, 3328] - - [692, 7969.94] + - [754, 7969.94] - - [1856, 1856, 1, 3328] - - [679, 8140.34] + - [741, 8140.34] - - [3584, 128, 1, 256] - - [689, 4861.05] + - [751, 4861.05] - - [448, 1408, 1, 3328] - - [679, 6353.75] + - [741, 6353.75] - - [2368, 2368, 1, 256] - - [693, 8369.37] + - [755, 8369.37] - - [4288, 4288, 1, 1280] - - [683, 8666.52] + - [745, 8666.52] - - [64, 448, 1, 1280] - - [636, 2591.92] + - [698, 2591.92] - - [5888, 1024, 1, 1280] - - [676, 8526.6] + - [738, 8526.6] - - [704, 1024, 1, 256] - - [689, 4971.8] + - [751, 4971.8] - - [1024, 12544, 1, 256] - - [727, 8611.9] + - [789, 8611.9] - - [448, 4, 1, 256] - - [632, 78.6534] + - [694, 78.6534] - - [5888, 448, 1, 128] - - [663, 3592.03] + - [725, 3592.03] - - [512, 48000, 1, 2560] - - [697, 9237.44] + - [759, 9237.44] - - [8448, 16, 1, 2816] - - [562, 3360.21] + - [624, 3360.21] - - [704, 6784, 1, 3328] - - [698, 7774.95] + - [760, 7774.95] - - [5888, 5888, 1, 1280] - - [684, 9238.25] + - [746, 9238.25] - - [5056, 1024, 1, 1280] - - [712, 8227.88] + - [774, 8227.88] - - [448, 5888, 1, 3328] - - [687, 7777.63] + - [749, 7777.63] - - [3072, 2, 1, 1024] - - [624, 376.383] + - [686, 376.383] - - [1024, 2944, 1, 1280] - - [677, 8650.45] + - [739, 8650.45] - - [5056, 5888, 1, 1280] - - [687, 8861.6] + - [749, 8861.6] - - [4288, 5888, 1, 128] - - [667, 5049.01] + - [729, 5049.01] - - [256, 3584, 1, 256] - - [679, 6314.11] + - [741, 6314.11] - - [256, 4, 1, 1280] - - [723, 163.94] + - [785, 163.94] - - [1408, 3584, 1, 128] - - [667, 4290.22] + - [729, 4290.22] - - [256, 2944, 1, 3328] - - [689, 7620.99] + - [751, 7620.99] - - [448, 3584, 1, 128] - - [667, 3353.9] + - [729, 3353.9] - - [5888, 2944, 1, 1280] - - [677, 9498.31] + - [739, 9498.31] - - [4, 6784, 1, 1280] - - [627, 623.916] + - [689, 623.916] - - [2368, 5888, 1, 128] - - [666, 4840.29] + - [728, 4840.29] - - [35, 8457, 1, 1760] - - [573, 4059.88] + - [635, 4059.88] - - [64, 2944, 1, 128] - - [584, 1310.82] + - [646, 1310.82] - - [2368, 4, 1, 256] - - [718, 369.739] + - [780, 369.739] - - [3584, 5888, 1, 256] - - [695, 7996.33] + - [757, 7996.33] - - [2368, 1024, 1, 128] - - [661, 3915.07] + - [723, 3915.07] - - [2368, 704, 1, 128] - - [661, 3658.97] + - [723, 3658.97] - - [512, 32, 1, 512] - - [650, 1127.6] + - [712, 1127.6] - - [3584, 2368, 1, 128] - - [661, 4462.48] + - [723, 4462.48] - - [5056, 704, 1, 128] - - [660, 4062.21] + - [722, 4062.21] - - [448, 2368, 1, 128] - - [661, 2829.07] + - [723, 2829.07] - - [4, 5056, 1, 256] - - [609, 425.868] + - [671, 425.868] - - [5056, 1408, 1, 3328] - - [694, 8848.92] + - [756, 8848.92] - - [1408, 704, 1, 256] - - [689, 5394.56] + - [751, 5394.56] - - [6784, 1024, 1, 3328] - - [676, 9232.02] + - [738, 9232.02] - - [6784, 2944, 1, 3328] - - [687, 8714.84] + - [749, 8714.84] - - [7680, 1, 1, 2560] - - [623, 248.845] + - [685, 248.845] - - [1856, 1856, 1, 256] - - [688, 7586.58] + - [750, 7586.58] - - [64, 64, 1, 3328] - - [658, 1363.25] + - [720, 1363.25] - - [512, 1, 1, 512] - - [572, 43.2158] + - [634, 43.2158] - - [6784, 2368, 1, 1280] - - [689, 8665.74] + - [751, 8665.74] - - [4608, 2, 1, 1536] - - [572, 452.65] + - [634, 452.65] - - [4288, 3584, 1, 256] - - [697, 8936.7] + - [759, 8936.7] - - [4288, 5888, 1, 1280] - - [694, 8957.15] + - [756, 8957.15] - - [4608, 4, 1, 1536] - - [565, 846.737] + - [627, 846.737] - - [1024, 6000, 1, 1536] - - [687, 8398.54] + - [749, 8398.54] - - [8448, 32, 1, 2816] - - [647, 5343.07] + - [709, 5343.07] - - [448, 2944, 1, 3328] - - [694, 7247.04] + - [756, 7247.04] - - [4288, 1856, 1, 1280] - - [677, 8902.86] + - [739, 8902.86] - - [1856, 2944, 1, 3328] - - [689, 8622.86] + - [751, 8622.86] - - [256, 6784, 1, 3328] - - [689, 8050.77] + - [751, 8050.77] - - [512, 3000, 1, 1536] - - [710, 7108.12] + - [772, 7108.12] - - [64, 5888, 1, 256] - - [702, 3567.74] + - [764, 3567.74] - - [256, 5056, 1, 128] - - [669, 3041.12] + - [731, 3041.12] - - [5056, 1024, 1, 256] - - [693, 8401.47] + - [755, 8401.47] - - [704, 64, 1, 3328] - - [652, 4299.02] + - [714, 4299.02] - - [5056, 1856, 1, 3328] - - [697, 8660.77] + - [759, 8660.77] - - [4, 2944, 1, 3328] - - [627, 618.637] + - [689, 618.637] - - [512, 1500, 1, 2048] - - [709, 5481.22] + - [771, 5481.22] - - [1024, 1, 1, 500000] - - [563, 260.061] + - [625, 260.061] - - [256, 4, 1, 256] - - [627, 50.5123] + - [689, 50.5123] - - [6784, 128, 1, 3328] - - [681, 6950.91] + - [743, 6950.91] - - [4288, 1408, 1, 128] - - [661, 4539.58] + - [723, 4539.58] - - [1856, 5888, 1, 3328] - - [687, 8712.93] + - [749, 8712.93] - - [4288, 5056, 1, 256] - - [693, 8997.15] + - [755, 8997.15] - - [1408, 128, 1, 1280] - - [616, 4599.12] + - [678, 4599.12] - - [4096, 7000, 1, 4096] - - [683, 8555.89] + - [745, 8555.89] - - [5056, 256, 1, 3328] - - [689, 8257.16] + - [751, 8257.16] - - [704, 704, 1, 256] - - [679, 5852.39] + - [741, 5852.39] - - [1024, 3000, 1, 2560] - - [676, 8258.84] + - [738, 8258.84] - - [1024, 5888, 1, 1280] - - [676, 8988.99] + - [738, 8988.99] - - [6784, 2368, 1, 128] - - [662, 4562.25] + - [724, 4562.25] - - [4, 5056, 1, 1280] - - [627, 600.441] + - [689, 600.441] - - [256, 64, 1, 1280] - - [650, 1899.69] + - [712, 1899.69] - - [128, 1856, 1, 1280] - - [689, 5185.76] + - [751, 5185.76] - - [1856, 1024, 1, 1280] - - [694, 7875.95] + - [756, 7875.95] - - [6784, 4288, 1, 1280] - - [697, 8981.18] + - [759, 8981.18] - - [1856, 1856, 1, 1280] - - [678, 7794.71] + - [740, 7794.71] - - [35, 1500, 1, 2048] - - [578, 2192.6] + - [640, 2192.6] - - [3072, 24000, 1, 1024] - - [690, 8690.58] + - [752, 8690.58] - - [1408, 5056, 1, 1280] - - [689, 8427.87] + - [751, 8427.87] - - [4, 2368, 1, 3328] - - [632, 594.422] + - [694, 594.422] - - [5888, 1856, 1, 128] - - [661, 4294.05] + - [723, 4294.05] - - [448, 704, 1, 1280] - - [684, 4136.39] + - [746, 4136.39] - - [448, 6784, 1, 128] - - [662, 3976.2] + - [724, 3976.2] - - [1024, 448, 1, 3328] - - [694, 6376.33] + - [756, 6376.33] - - [2944, 128, 1, 256] - - [679, 4466.26] + - [741, 4466.26] - - [5056, 3584, 1, 128] - - [667, 4997.18] + - [729, 4997.18] - - [5888, 5888, 1, 3328] - - [697, 8870.37] + - [759, 8870.37] - - [6784, 1024, 1, 256] - - [676, 8520.53] + - [738, 8520.53] - - [2944, 2368, 1, 256] - - [713, 6174.59] + - [775, 6174.59] - - [256, 448, 1, 256] - - [689, 1844.33] + - [751, 1844.33] - - [5056, 5888, 1, 3328] - - [678, 8076.65] + - [740, 8076.65] - - [1856, 1024, 1, 256] - - [689, 7188.92] + - [751, 7188.92] - - [512, 48000, 1, 1536] - - [700, 7282.2] + - [762, 7282.2] - - [3584, 448, 1, 1280] - - [679, 6869.1] + - [741, 6869.1] - - [1024, 1024, 1, 1280] - - [689, 8027.45] + - [751, 8027.45] - - [448, 5888, 1, 256] - - [679, 5765.84] + - [741, 5765.84] - - [2048, 128, 1, 2048] - - [637, 4835.01] + - [699, 4835.01] - - [1408, 6784, 1, 3328] - - [689, 8613.76] + - [751, 8613.76] - - [448, 1024, 1, 128] - - [660, 2315.57] + - [722, 2315.57] - - [4288, 704, 1, 128] - - [661, 4138.92] + - [723, 4138.92] - - [128, 1856, 1, 128] - - [596, 1397.56] + - [658, 1397.56] - - [448, 2368, 1, 3328] - - [679, 6786.48] + - [741, 6786.48] - - [5056, 64, 1, 128] - - [661, 1664.84] + - [723, 1664.84] - - [5056, 2944, 1, 256] - - [712, 7697.49] + - [774, 7697.49] - - [6784, 5888, 1, 128] - - [661, 5003.67] + - [723, 5003.67] - - [1024, 700, 1, 512] - - [689, 6036.31] + - [751, 6036.31] - - [3072, 1, 1, 128] - - [643, 70.3171] + - [705, 70.3171] - - [1024, 4, 1, 256] - - [601, 154.302] + - [663, 154.302] - - [2944, 704, 1, 128] - - [667, 3697.0] + - [729, 3697.0] - - [128, 6784, 1, 1280] - - [679, 6731.51] + - [741, 6731.51] - - [1408, 3584, 1, 3328] - - [677, 9258.07] + - [739, 9258.07] - - [2368, 6784, 1, 256] - - [676, 8840.4] + - [738, 8840.4] - - [5056, 1408, 1, 1280] - - [677, 9240.84] + - [739, 9240.84] - - [5056, 4288, 1, 128] - - [672, 4309.18] + - [734, 4309.18] - - [4, 704, 1, 256] - - [627, 130.697] + - [689, 130.697] - - [4288, 2368, 1, 3328] - - [690, 8755.33] + - [752, 8755.33] - - [1408, 1856, 1, 128] - - [660, 3918.75] + - [722, 3918.75] - - [1408, 5888, 1, 3328] - - [697, 8910.47] + - [759, 8910.47] - - [1856, 256, 1, 256] - - [679, 5631.34] + - [741, 5631.34] - - [6784, 6784, 1, 256] - - [687, 9298.76] + - [749, 9298.76] - - [5888, 5056, 1, 128] - - [662, 4811.36] + - [724, 4811.36] - - [4288, 2368, 1, 128] - - [661, 4749.1] + - [723, 4749.1] - - [128, 5888, 1, 1280] - - [688, 6393.86] + - [750, 6393.86] - - [256, 4288, 1, 1280] - - [679, 6887.79] + - [741, 6887.79] - - [2368, 2944, 1, 256] - - [693, 8314.82] + - [755, 8314.82] - - [4, 1856, 1, 256] - - [716, 267.03] + - [778, 267.03] - - [3584, 1856, 1, 1280] - - [677, 8631.91] + - [739, 8631.91] - - [6784, 6784, 1, 128] - - [667, 5059.96] + - [729, 5059.96] - - [256, 1856, 1, 128] - - [660, 1858.82] + - [722, 1858.82] - - [49, 512, 64, 2048] - - [731, 3053.67] + - [793, 3053.67] - - [704, 64, 1, 1280] - - [610, 2849.49] + - [672, 2849.49] - - [5888, 5056, 1, 256] - - [696, 8202.52] + - [758, 8202.52] - - [8448, 48000, 1, 2816] - - [687, 4281.94] + - [749, 4281.94] - - [512, 6000, 1, 2048] - - [679, 8047.89] + - [741, 8047.89] - - [3584, 448, 1, 256] - - [689, 6805.43] + - [751, 6805.43] - - [448, 4288, 1, 128] - - [667, 3500.83] + - [729, 3500.83] - - [7680, 64, 1, 2560] - - [622, 5957.9] + - [684, 5957.9] - - [256, 6784, 1, 256] - - [689, 7331.83] + - [751, 7331.83] - - [1408, 4288, 1, 128] - - [661, 4501.49] + - [723, 4501.49] - - [2944, 704, 1, 3328] - - [689, 8439.7] + - [751, 8439.7] - - [128, 448, 1, 256] - - [610, 1555.19] + - [672, 1555.19] - - [2048, 32, 1, 2048] - - [621, 3226.49] + - [683, 3226.49] - - [3584, 3584, 1, 256] - - [693, 8784.9] + - [755, 8784.9] - - [448, 1408, 1, 128] - - [660, 2535.92] + - [722, 2535.92] - - [128, 256, 1, 1280] - - [636, 2896.72] + - [698, 2896.72] - - [3584, 5056, 1, 256] - - [680, 8566.52] + - [742, 8566.52] - - [6784, 128, 1, 256] - - [679, 6053.97] + - [741, 6053.97] - - [4288, 4, 1, 256] - - [599, 428.9] + - [661, 428.9] - - [64, 1408, 1, 3328] - - [604, 5025.11] + - [666, 5025.11] - - [704, 448, 1, 256] - - [703, 3409.74] + - [765, 3409.74] - - [2944, 2368, 1, 1280] - - [677, 9066.35] + - [739, 9066.35] - - [448, 64, 1, 3328] - - [652, 3528.96] + - [714, 3528.96] - - [704, 6784, 1, 128] - - [666, 4212.61] + - [728, 4212.61] - - [3584, 4, 1, 3328] - - [719, 658.353] + - [781, 658.353] - - [6784, 3584, 1, 256] - - [687, 9061.84] + - [749, 9061.84] - - [704, 448, 1, 128] - - [666, 1552.8] + - [728, 1552.8] - - [256, 128, 1, 128] - - [591, 281.975] + - [653, 281.975] - - [704, 1408, 1, 128] - - [666, 3026.76] + - [728, 3026.76] - - [4, 448, 1, 128] - - [715, 5.56127] + - [777, 5.56127] - - [4288, 128, 1, 1280] - - [646, 5471.64] + - [708, 5471.64] - - [128, 1408, 1, 256] - - [689, 2813.35] + - [751, 2813.35] - - [4, 2944, 1, 256] - - [609, 316.766] + - [671, 316.766] - - [64, 128, 1, 3328] - - [657, 1872.56] + - [719, 1872.56] - - [1856, 1408, 1, 256] - - [679, 7735.89] + - [741, 7735.89] - - [5056, 2368, 1, 128] - - [661, 4830.19] + - [723, 4830.19] - - [2944, 2944, 1, 3328] - - [697, 8890.11] + - [759, 8890.11] - - [5056, 6784, 1, 256] - - [687, 9015.25] + - [749, 9015.25] - - [1856, 3584, 1, 128] - - [668, 4455.12] + - [730, 4455.12] - - [5888, 4, 1, 1280] - - [717, 642.063] + - [779, 642.063] - - [128, 2944, 1, 128] - - [586, 2037.03] + - [648, 2037.03] - - [35, 8457, 1, 2560] - - [574, 3988.23] + - [636, 3988.23] - - [3584, 6784, 1, 128] - - [661, 4774.54] + - [723, 4774.54] - - [128, 4288, 1, 256] - - [679, 4851.85] + - [741, 4851.85] - - [704, 448, 1, 3328] - - [694, 4432.63] + - [756, 4432.63] - - [2368, 6784, 1, 1280] - - [677, 9161.48] + - [739, 9161.48] - - [128, 128, 1, 3328] - - [651, 2839.99] + - [713, 2839.99] - - [5056, 1856, 1, 256] - - [693, 8380.94] + - [755, 8380.94] - - [256, 128, 1, 256] - - [635, 1165.18] + - [697, 1165.18] - - [1024, 3000, 1, 2816] - - [694, 8714.27] + - [756, 8714.27] - - [1024, 1856, 1, 256] - - [684, 7014.79] + - [746, 7014.79] - - [64, 1, 1, 1216] - - [657, 11.8205] + - [719, 11.8205] - - [4288, 64, 1, 128] - - [588, 1669.65] + - [650, 1669.65] - - [256, 448, 1, 3328] - - [612, 5152.39] + - [674, 5152.39] - - [1408, 6784, 1, 1280] - - [697, 8735.22] + - [759, 8735.22] - - [3584, 3584, 1, 1280] - - [694, 9020.09] + - [756, 9020.09] - - [7680, 24000, 1, 2560] - - [697, 6940.24] + - [759, 6940.24] - - [64, 2368, 1, 1280] - - [607, 4433.07] + - [669, 4433.07] - - [448, 2368, 1, 1280] - - [682, 5352.92] + - [744, 5352.92] - - [4608, 48000, 1, 1536] - - [676, 8129.11] + - [738, 8129.11] - - [5888, 5888, 1, 128] - - [669, 4700.91] + - [731, 4700.91] - - [64, 6784, 1, 3328] - - [679, 6170.82] + - [741, 6170.82] - - [2944, 256, 1, 1280] - - [709, 6177.65] + - [771, 6177.65] - - [2048, 16, 1, 2048] - - [631, 2167.7] + - [693, 2167.7] - - [256, 2368, 1, 128] - - [660, 2037.77] + - [722, 2037.77] - - [5056, 2368, 1, 3328] - - [677, 9040.6] + - [739, 9040.6] - - [2944, 4288, 1, 256] - - [708, 7552.22] + - [770, 7552.22] - - [1408, 3584, 1, 1280] - - [684, 8808.76] + - [746, 8808.76] - - [2368, 64, 1, 256] - - [620, 2320.51] + - [682, 2320.51] - - [1024, 128, 1, 128] - - [580, 1075.56] + - [642, 1075.56] - - [704, 128, 1, 3328] - - [613, 4985.02] + - [675, 4985.02] - - [5888, 4, 1, 128] - - [714, 33.6558] + - [776, 33.6558] - - [1856, 704, 1, 256] - - [689, 7110.98] + - [751, 7110.98] - - [1024, 1500, 1, 2816] - - [684, 8499.88] + - [746, 8499.88] - - [8448, 1, 1, 2816] - - [567, 251.469] + - [629, 251.469] - - [1024, 4, 1, 3328] - - [723, 541.032] + - [785, 541.032] - - [1024, 6000, 1, 2048] - - [684, 8698.59] + - [746, 8698.59] - - [512, 24000, 1, 2560] - - [677, 8963.7] + - [739, 8963.7] - - [6144, 3000, 1, 2560] - - [700, 8761.97] + - [762, 8761.97] - - [2368, 6784, 1, 3328] - - [694, 8867.49] + - [756, 8867.49] - - [1856, 1408, 1, 1280] - - [681, 7908.53] + - [743, 7908.53] - - [1856, 448, 1, 1280] - - [694, 6544.01] + - [756, 6544.01] - - [6784, 704, 1, 128] - - [660, 4086.45] + - [722, 4086.45] - - [4, 4, 1, 256] - - [627, 0.852941] + - [689, 0.852941] - - [128, 5888, 1, 128] - - [584, 2582.25] + - [646, 2582.25] - - [5056, 2944, 1, 128] - - [664, 4579.17] + - [726, 4579.17] - - [1408, 5888, 1, 256] - - [676, 8810.77] + - [738, 8810.77] - - [704, 2944, 1, 1280] - - [691, 8420.9] + - [753, 8420.9] - - [4288, 64, 1, 1280] - - [616, 4906.15] + - [678, 4906.15] - - [256, 64, 1, 256] - - [618, 689.953] + - [680, 689.953] - - [1024, 1024, 1, 256] - - [694, 5528.01] + - [756, 5528.01] - - [704, 1856, 1, 256] - - [678, 4452.92] + - [740, 4452.92] - - [2560, 64, 1, 2560] - - [607, 4563.09] + - [669, 4563.09] - - [3584, 704, 1, 1280] - - [684, 7898.77] + - [746, 7898.77] - - [256, 128, 1, 1280] - - [636, 2865.06] + - [698, 2865.06] - - [5888, 2368, 1, 256] - - [683, 8628.37] + - [745, 8628.37] - - [256, 2368, 1, 1280] - - [679, 6073.57] + - [741, 6073.57] - - [2944, 6784, 1, 128] - - [660, 4756.77] + - [722, 4756.77] - - [3584, 448, 1, 3328] - - [679, 7265.07] + - [741, 7265.07] - - [1408, 4, 1, 256] - - [720, 234.157] + - [782, 234.157] - - [704, 2368, 1, 3328] - - [677, 7248.98] + - [739, 7248.98] - - [2944, 448, 1, 256] - - [684, 6365.89] + - [746, 6365.89] - - [1856, 448, 1, 128] - - [662, 2976.34] + - [724, 2976.34] - - [4608, 6000, 1, 1536] - - [697, 9469.42] + - [759, 9469.42] - - [2368, 128, 1, 1280] - - [646, 4773.39] + - [708, 4773.39] - - [256, 5888, 1, 128] - - [661, 3112.0] + - [723, 3112.0] - - [64, 6784, 1, 256] - - [679, 3755.14] + - [741, 3755.14] - - [64, 5056, 1, 1280] - - [640, 4935.6] + - [702, 4935.6] - - [4, 6784, 1, 128] - - [715, 111.142] + - [777, 111.142] - - [3025, 64, 64, 64] - - [729, 6643.75] + - [791, 6643.75] - - [2944, 2944, 1, 1280] - - [677, 8869.55] + - [739, 8869.55] - - [5056, 448, 1, 3328] - - [710, 6706.2] + - [772, 6706.2] - - [4, 3584, 1, 1280] - - [627, 573.54] + - [689, 573.54] - - [1408, 128, 1, 128] - - [579, 1293.19] + - [641, 1293.19] - - [6784, 704, 1, 3328] - - [694, 8368.33] + - [756, 8368.33] - - [128, 64, 1, 1280] - - [653, 1260.41] + - [715, 1260.41] - - [2368, 256, 1, 1280] - - [679, 6154.47] + - [741, 6154.47] - - [4, 448, 1, 3328] - - [632, 351.738] + - [694, 351.738] - - [5888, 4288, 1, 128] - - [661, 4340.99] + - [723, 4340.99] - - [4, 5888, 1, 256] - - [609, 428.318] + - [671, 428.318] - - [1408, 2944, 1, 3328] - - [676, 9400.85] + - [738, 9400.85] - - [3584, 704, 1, 128] - - [663, 3392.55] + - [725, 3392.55] - - [64, 1024, 1, 256] - - [610, 1762.41] + - [672, 1762.41] - - [2368, 448, 1, 1280] - - [703, 5972.58] + - [765, 5972.58] - - [128, 3584, 1, 256] - - [679, 5224.32] + - [741, 5224.32] - - [704, 448, 1, 1280] - - [679, 4566.86] + - [741, 4566.86] - - [448, 5056, 1, 128] - - [661, 3876.19] + - [723, 3876.19] - - [6144, 4, 1, 2560] - - [603, 948.751] + - [665, 948.751] - - [5056, 3584, 1, 256] - - [693, 8162.56] + - [755, 8162.56] - - [4288, 4288, 1, 256] - - [700, 7653.34] + - [762, 7653.34] - - [1408, 5056, 1, 128] - - [667, 4554.34] + - [729, 4554.34] - - [2944, 3584, 1, 128] - - [673, 4147.0] + - [735, 4147.0] - - [3584, 2368, 1, 256] - - [694, 8195.05] + - [756, 8195.05] - - [5888, 5056, 1, 1280] - - [693, 9413.43] + - [755, 9413.43] - - [128, 1024, 1, 1280] - - [646, 4433.83] + - [708, 4433.83] - - [8448, 24000, 1, 2816] - - [687, 5227.12] + - [749, 5227.12] - - [64, 704, 1, 256] - - [610, 1441.89] + - [672, 1441.89] - - [4288, 256, 1, 1280] - - [709, 5687.8] + - [771, 5687.8] - - [3584, 3584, 1, 3328] - - [684, 9183.63] + - [746, 9183.63] - - [704, 64, 1, 128] - - [588, 402.835] + - [650, 402.835] - - [3072, 1500, 1, 128] - - [683, 7395.08] + - [745, 7395.08] - - [2048, 3136, 1, 512] - - [725, 8447.3] + - [787, 8447.3] - - [3025, 256, 64, 64] - - [733, 8063.79] + - [795, 8063.79] - - [5888, 6784, 1, 256] - - [677, 9282.01] + - [739, 9282.01] - - [4288, 2944, 1, 3328] - - [677, 9153.87] + - [739, 9153.87] - - [2944, 64, 1, 128] - - [594, 1463.53] + - [656, 1463.53] - - [1024, 128, 1, 3328] - - [644, 5377.41] + - [706, 5377.41] - - [1024, 16, 1, 500000] - - [560, 3997.13] + - [622, 3997.13] - - [4288, 128, 1, 3328] - - [648, 6053.31] + - [710, 6053.31] - - [7680, 128, 1, 2560] - - [694, 7769.24] + - [756, 7769.24] - - [256, 5056, 1, 1280] - - [703, 7200.84] + - [765, 7200.84] - - [1408, 256, 1, 128] - - [671, 1671.74] + - [733, 1671.74] - - [2944, 5888, 1, 3328] - - [683, 8642.18] + - [745, 8642.18] - - [6784, 5888, 1, 1280] - - [697, 8871.15] + - [759, 8871.15] - - [3072, 1, 1, 1024] - - [643, 205.972] + - [705, 205.972] - - [704, 128, 1, 256] - - [606, 1935.39] + - [668, 1935.39] - - [5888, 4288, 1, 1280] - - [684, 9176.7] + - [746, 9176.7] - - [1024, 24000, 1, 2048] - - [683, 8667.79] + - [745, 8667.79] - - [448, 256, 1, 1280] - - [616, 4327.95] + - [678, 4327.95] - - [5888, 3584, 1, 128] - - [661, 4669.45] + - [723, 4669.45] - - [64, 4288, 1, 3328] - - [641, 5375.04] + - [703, 5375.04] - - [448, 4, 1, 1280] - - [632, 289.716] + - [694, 289.716] - - [6784, 6784, 1, 3328] - - [690, 8306.73] + - [752, 8306.73] - - [5056, 4, 1, 1280] - - [602, 607.199] + - [664, 607.199] - - [4, 5888, 1, 3328] - - [627, 651.538] + - [689, 651.538] - - [256, 1408, 1, 1280] - - [679, 5177.09] + - [741, 5177.09] - - [3072, 16, 1, 1024] - - [638, 2207.63] + - [700, 2207.63] - - [704, 3584, 1, 128] - - [671, 3653.51] + - [733, 3653.51] - - [1024, 2, 1, 512] - - [658, 156.138] + - [720, 156.138] - - [5888, 448, 1, 3328] - - [679, 7896.85] + - [741, 7896.85] - - [2368, 4288, 1, 1280] - - [676, 8517.63] + - [738, 8517.63] - - [4288, 2944, 1, 128] - - [665, 4439.26] + - [727, 4439.26] - - [256, 64, 1, 3328] - - [651, 2704.76] + - [713, 2704.76] - - [2944, 64, 1, 3328] - - [616, 5647.15] + - [678, 5647.15] - - [6784, 64, 1, 3328] - - [689, 6434.61] + - [751, 6434.61] - - [5056, 2944, 1, 3328] - - [700, 8497.2] + - [762, 8497.2] - - [448, 128, 1, 256] - - [618, 1516.64] + - [680, 1516.64] - - [2944, 3584, 1, 256] - - [694, 8365.83] + - [756, 8365.83] - - [1408, 1408, 1, 3328] - - [677, 8440.42] + - [739, 8440.42] - - [1856, 128, 1, 1280] - - [679, 5242.93] + - [741, 5242.93] - - [3584, 3584, 1, 128] - - [661, 4385.94] + - [723, 4385.94] - - [64, 3584, 1, 256] - - [679, 3276.9] + - [741, 3276.9] - - [1408, 4, 1, 3328] - - [602, 605.504] + - [664, 605.504] - - [128, 2944, 1, 3328] - - [647, 6295.75] + - [709, 6295.75] - - [3584, 704, 1, 256] - - [684, 7711.64] + - [746, 7711.64] - - [2944, 448, 1, 3328] - - [695, 6503.97] + - [757, 6503.97] - - [1024, 2, 1, 500000] - - [564, 521.803] + - [626, 521.803] - - [3584, 1408, 1, 3328] - - [686, 8296.2] + - [748, 8296.2] - - [704, 3584, 1, 1280] - - [691, 7670.65] + - [753, 7670.65] - - [1024, 1408, 1, 128] - - [666, 2830.61] + - [728, 2830.61] - - [1856, 6784, 1, 256] - - [697, 8149.67] + - [759, 8149.67] - - [4288, 448, 1, 3328] - - [678, 7406.44] + - [740, 7406.44] - - [6784, 4288, 1, 128] - - [673, 4418.09] + - [735, 4418.09] - - [6784, 704, 1, 1280] - - [694, 8302.45] + - [756, 8302.45] - - [6144, 1, 1, 2560] - - [603, 243.427] + - [665, 243.427] - - [3584, 6784, 1, 256] - - [676, 9036.59] + - [738, 9036.59] - - [6144, 16, 1, 2560] - - [610, 3266.69] + - [672, 3266.69] - - [3584, 64, 1, 128] - - [594, 1555.19] + - [656, 1555.19] - - [5888, 1024, 1, 3328] - - [684, 8888.08] + - [746, 8888.08] - - [448, 64, 1, 128] - - [580, 248.074] + - [642, 248.074] - - [704, 6784, 1, 1280] - - [680, 7892.56] + - [742, 7892.56] - - [4, 448, 1, 256] - - [602, 70.8951] + - [664, 70.8951] - - [196, 1024, 64, 256] - - [728, 6630.86] + - [790, 6630.86] - - [5888, 128, 1, 256] - - [678, 5715.09] + - [740, 5715.09] - - [4096, 16, 1, 4096] - - [624, 3251.5] + - [686, 3251.5] - - [1856, 5056, 1, 3328] - - [693, 8740.27] + - [755, 8740.27] - - [4, 6784, 1, 256] - - [716, 360.412] + - [778, 360.412] - - [1024, 3584, 1, 128] - - [661, 3456.27] + - [723, 3456.27] - - [64, 704, 1, 3328] - - [629, 3817.47] + - [691, 3817.47] - - [2368, 2944, 1, 128] - - [667, 4605.47] + - [729, 4605.47] - - [5056, 64, 1, 256] - - [679, 3863.79] + - [741, 3863.79] - - [512, 1500, 1, 1536] - - [679, 6801.56] + - [741, 6801.56] - - [512, 1, 1, 500000] - - [568, 261.068] + - [630, 261.068] - - [5888, 2944, 1, 3328] - - [683, 8501.88] + - [745, 8501.88] - - [128, 3584, 1, 1280] - - [684, 5938.64] + - [746, 5938.64] - - [1024, 704, 1, 128] - - [670, 2172.29] + - [732, 2172.29] - - [1408, 2368, 1, 128] - - [666, 4023.2] + - [728, 4023.2] - - [5888, 2368, 1, 128] - - [667, 4424.62] + - [729, 4424.62] - - [128, 5056, 1, 3328] - - [679, 6692.16] + - [741, 6692.16] - - [3584, 6784, 1, 1280] - - [677, 9488.64] + - [739, 9488.64] - - [4288, 1856, 1, 256] - - [687, 8287.52] + - [749, 8287.52] - - [1856, 5888, 1, 256] - - [698, 7707.83] + - [760, 7707.83] - - [256, 256, 1, 256] - - [645, 1613.29] + - [707, 1613.29] - - [4288, 4288, 1, 3328] - - [687, 8923.59] + - [749, 8923.59] - - [1024, 1024, 1, 128] - - [667, 2553.71] + - [729, 2553.71] - - [4288, 1408, 1, 1280] - - [687, 8930.47] + - [749, 8930.47] - - [3584, 5056, 1, 128] - - [671, 4495.15] + - [733, 4495.15] - - [4, 1024, 1, 3328] - - [627, 415.694] + - [689, 415.694] - - [4, 704, 1, 128] - - [715, 13.9634] + - [777, 13.9634] - - [4288, 2368, 1, 256] - - [712, 7135.08] + - [774, 7135.08] - - [2944, 5056, 1, 1280] - - [684, 9118.61] + - [746, 9118.61] - - [448, 6784, 1, 256] - - [708, 5430.31] + - [770, 5430.31] - - [64, 128, 1, 128] - - [591, 83.057] + - [653, 83.057] - - [1856, 2368, 1, 128] - - [667, 4422.75] + - [729, 4422.75] - - [6784, 2368, 1, 3328] - - [680, 8769.4] + - [742, 8769.4] - - [1408, 6784, 1, 128] - - [667, 4739.0] + - [729, 4739.0] - - [256, 1024, 1, 1280] - - [689, 5722.21] + - [751, 5722.21] - - [704, 4, 1, 128] - - [715, 8.66578] + - [777, 8.66578] - - [1408, 4, 1, 128] - - [715, 26.1439] + - [777, 26.1439] - - [4288, 128, 1, 256] - - [689, 4865.38] + - [751, 4865.38] - - [4288, 1856, 1, 3328] - - [676, 9250.04] + - [738, 9250.04] - - [3584, 448, 1, 128] - - [667, 3029.59] + - [729, 3029.59] - - [64, 4288, 1, 128] - - [584, 1535.38] + - [646, 1535.38] - - [64, 448, 1, 3328] - - [654, 3457.36] + - [716, 3457.36] - - [448, 4, 1, 3328] - - [632, 367.328] + - [694, 367.328] - - [256, 4, 1, 3328] - - [723, 320.389] + - [785, 320.389] - - [4, 1408, 1, 1280] - - [720, 344.039] + - [782, 344.039] - - [3584, 64, 1, 1280] - - [608, 5191.07] + - [670, 5191.07] - - [1408, 448, 1, 128] - - [668, 2218.24] + - [730, 2218.24] - - [3584, 1024, 1, 1280] - - [690, 8253.11] + - [752, 8253.11] - - [1856, 5056, 1, 256] - - [708, 7552.55] + - [770, 7552.55] - - [4, 3584, 1, 256] - - [627, 325.456] + - [689, 325.456] - - [6784, 4288, 1, 3328] - - [683, 8655.34] + - [745, 8655.34] - - [4, 2944, 1, 1280] - - [627, 547.821] + - [689, 547.821] - - [1024, 4288, 1, 256] - - [684, 7788.83] + - [746, 7788.83] - - [5888, 3584, 1, 3328] - - [687, 9173.39] + - [749, 9173.39] - - [1856, 4, 1, 256] - - [718, 282.919] + - [780, 282.919] - - [4, 256, 1, 256] - - [627, 49.7485] + - [689, 49.7485] - - [5056, 3584, 1, 3328] - - [693, 8457.53] + - [755, 8457.53] - - [1408, 128, 1, 3328] - - [647, 5714.52] + - [709, 5714.52] - - [4, 64, 1, 1280] - - [723, 42.7667] + - [785, 42.7667] - - [2368, 1408, 1, 1280] - - [684, 8224.92] + - [746, 8224.92] - - [5056, 2944, 1, 1280] - - [676, 9295.13] + - [738, 9295.13] - - [8448, 6000, 1, 2816] - - [680, 8037.97] + - [742, 8037.97] - - [4, 4, 1, 128] - - [715, 0.1433898] + - [777, 0.1433898] - - [3584, 256, 1, 256] - - [679, 6116.79] + - [741, 6116.79] - - [3584, 2944, 1, 1280] - - [676, 8796.49] + - [738, 8796.49] - - [1024, 6784, 1, 256] - - [683, 8187.86] + - [745, 8187.86] - - [4, 128, 1, 256] - - [627, 30.4407] + - [689, 30.4407] - - [6784, 448, 1, 256] - - [679, 7862.3] + - [741, 7862.3] - - [5124, 9124, 1, 2048] - - [681, 8176.41] + - [743, 8176.41] - - [2944, 5056, 1, 3328] - - [676, 9328.34] + - [738, 9328.34] - - [6784, 4, 1, 128] - - [714, 204.9] + - [776, 204.9] - - [2944, 1408, 1, 128] - - [665, 3838.2] + - [727, 3838.2] - - [448, 128, 1, 3328] - - [630, 4632.16] + - [692, 4632.16] - - [64, 2944, 1, 3328] - - [647, 5663.47] + - [709, 5663.47] - - [5056, 6784, 1, 3328] - - [683, 8420.17] + - [745, 8420.17] - - [704, 2368, 1, 128] - - [667, 3321.79] + - [729, 3321.79] - - [3072, 1500, 1, 1024] - - [684, 8221.77] + - [746, 8221.77] - - [128, 2944, 1, 256] - - [679, 4550.52] + - [741, 4550.52] - - [128, 6784, 1, 128] - - [584, 2767.76] + - [646, 2767.76] - - [3584, 4288, 1, 256] - - [683, 8808.64] + - [745, 8808.64] - - [448, 1856, 1, 256] - - [688, 5166.63] + - [750, 5166.63] - - [1856, 6784, 1, 3328] - - [680, 8339.76] + - [742, 8339.76] - - [3584, 128, 1, 3328] - - [689, 6791.57] + - [751, 6791.57] - - [64, 1856, 1, 256] - - [611, 2210.03] + - [673, 2210.03] - - [64, 448, 1, 256] - - [643, 1008.35] + - [705, 1008.35] - - [5888, 4288, 1, 256] - - [683, 8869.63] + - [745, 8869.63] - - [128, 1500, 1, 1280] - - [640, 4733.54] + - [702, 4733.54] - - [5056, 1408, 1, 256] - - [681, 7523.31] + - [743, 7523.31] - - [35, 8457, 1, 4096] - - [574, 4023.17] + - [636, 4023.17] - - [64, 256, 1, 1280] - - [635, 1941.91] + - [697, 1941.91] - - [2944, 4, 1, 128] - - [714, 95.7426] + - [776, 95.7426] - - [3584, 1024, 1, 256] - - [706, 6553.68] + - [768, 6553.68] - - [512, 6000, 1, 1536] - - [680, 7357.25] + - [742, 7357.25] - - [256, 704, 1, 256] - - [679, 2912.81] + - [741, 2912.81] - - [5888, 5888, 1, 256] - - [690, 8802.7] + - [752, 8802.7] - - [4288, 1024, 1, 1280] - - [683, 8248.83] + - [745, 8248.83] - - [5888, 128, 1, 3328] - - [633, 6848.59] + - [695, 6848.59] - - [448, 6784, 1, 3328] - - [679, 8343.78] + - [741, 8343.78] - - [2944, 1408, 1, 1280] - - [676, 9229.48] + - [738, 9229.48] - - [3072, 6000, 1, 1024] - - [697, 9015.01] + - [759, 9015.01] - - [1024, 32, 1, 512] - - [618, 1498.07] + - [680, 1498.07] - - [2944, 1856, 1, 3328] - - [693, 7176.48] + - [755, 7176.48] - - [2368, 64, 1, 128] - - [584, 1206.48] + - [646, 1206.48] - - [256, 1024, 1, 128] - - [661, 1178.28] + - [723, 1178.28] - - [3584, 5888, 1, 1280] - - [683, 9023.58] + - [745, 9023.58] - - [64, 4, 1, 128] - - [715, 1.089372] + - [777, 1.089372] - - [6784, 1856, 1, 1280] - - [677, 8964.51] + - [739, 8964.51] - - [2944, 5056, 1, 256] - - [683, 8860.12] + - [745, 8860.12] - - [5888, 256, 1, 3328] - - [694, 8308.66] + - [756, 8308.66] - - [2944, 4288, 1, 128] - - [662, 4507.61] + - [724, 4507.61] - - [3584, 1408, 1, 256] - - [677, 8234.71] + - [739, 8234.71] - - [704, 3584, 1, 3328] - - [689, 7377.26] + - [751, 7377.26] - - [5056, 448, 1, 1280] - - [678, 7145.47] + - [740, 7145.47] - - [3584, 1856, 1, 3328] - - [694, 8954.81] + - [756, 8954.81] - - [64, 1408, 1, 128] - - [591, 731.974] + - [653, 731.974] - - [4288, 6784, 1, 1280] - - [683, 9166.55] + - [745, 9166.55] - - [1024, 3000, 1, 2048] - - [694, 7723.83] + - [756, 7723.83] - - [1408, 704, 1, 1280] - - [684, 7863.1] + - [746, 7863.1] - - [2944, 1024, 1, 256] - - [677, 5035.02] + - [739, 5035.02] - - [256, 64, 1, 128] - - [583, 150.757] + - [645, 150.757] - - [2368, 4288, 1, 3328] - - [681, 8568.84] + - [743, 8568.84] - - [4, 1408, 1, 256] - - [627, 219.885] + - [689, 219.885] - - [1024, 1408, 1, 1280] - - [709, 6761.13] + - [771, 6761.13] - - [64, 64, 1, 256] - - [609, 198.694] + - [671, 198.694] - - [704, 256, 1, 3328] - - [679, 4291.62] + - [741, 4291.62] - - [6784, 5056, 1, 256] - - [678, 8545.02] + - [740, 8545.02] - - [1856, 1856, 1, 128] - - [666, 4034.93] + - [728, 4034.93] - - [4288, 5888, 1, 256] - - [697, 8998.05] + - [759, 8998.05] - - [4, 704, 1, 3328] - - [632, 452.4] + - [694, 452.4] - - [35, 8457, 1, 2048] - - [575, 3375.37] + - [637, 3375.37] - - [448, 2944, 1, 256] - - [679, 6346.74] + - [741, 6346.74] - - [4, 4288, 1, 3328] - - [632, 630.978] + - [694, 630.978] - - [2944, 6784, 1, 256] - - [706, 8002.92] + - [768, 8002.92] - - [2944, 2944, 1, 128] - - [661, 4661.41] + - [723, 4661.41] - - [4, 4, 1, 1280] - - [632, 3.14762] + - [694, 3.14762] - - [1856, 3584, 1, 1280] - - [676, 8677.66] + - [738, 8677.66] - - [64, 2944, 1, 256] - - [679, 2926.95] + - [741, 2926.95] - - [3584, 1408, 1, 1280] - - [690, 8238.9] + - [752, 8238.9] - - [448, 256, 1, 128] - - [591, 1042.72] + - [653, 1042.72] - - [4288, 448, 1, 128] - - [667, 3698.82] + - [729, 3698.82] - - [5056, 256, 1, 1280] - - [684, 7058.5] + - [746, 7058.5] - - [1856, 1408, 1, 3328] - - [681, 8348.35] + - [743, 8348.35] - - [128, 128, 1, 128] - - [591, 145.736] + - [653, 145.736] - - [1024, 4288, 1, 3328] - - [677, 8042.61] + - [739, 8042.61] - - [448, 2368, 1, 256] - - [689, 5935.0] + - [751, 5935.0] - - [1024, 4, 1, 128] - - [715, 15.93] + - [777, 15.93] - - [64, 1408, 1, 1280] - - [613, 3865.49] + - [675, 3865.49] - - [64, 6784, 1, 1280] - - [709, 5629.61] + - [771, 5629.61] - - [5056, 448, 1, 256] - - [679, 7637.91] + - [741, 7637.91] - - [2944, 2368, 1, 3328] - - [687, 9112.44] + - [749, 9112.44] - - [704, 4288, 1, 3328] - - [679, 7950.2] + - [741, 7950.2] - - [1408, 128, 1, 256] - - [679, 2898.17] + - [741, 2898.17] - - [1024, 1856, 1, 1280] - - [677, 8087.51] + - [739, 8087.51] - - [6784, 1856, 1, 256] - - [708, 7538.25] + - [770, 7538.25] - - [512, 48000, 1, 2816] - - [676, 9704.21] + - [738, 9704.21] - - [512, 3000, 1, 2816] - - [678, 7621.63] + - [740, 7621.63] - - [128, 2368, 1, 3328] - - [641, 6038.94] + - [703, 6038.94] - - [1024, 5888, 1, 256] - - [693, 8185.82] + - [755, 8185.82] - - [64, 2944, 1, 1280] - - [640, 4540.24] + - [702, 4540.24] - - [6784, 1408, 1, 256] - - [693, 8574.0] + - [755, 8574.0] - - [5056, 64, 1, 3328] - - [641, 6310.97] + - [703, 6310.97] - - [128, 704, 1, 128] - - [580, 696.618] + - [642, 696.618] - - [1408, 2368, 1, 256] - - [679, 4995.06] + - [741, 4995.06] - - [1408, 1408, 1, 256] - - [676, 7552.34] + - [738, 7552.34] - - [4, 64, 1, 128] - - [714, 1.90441] + - [776, 1.90441] - - [64, 128, 1, 1280] - - [653, 1272.64] + - [715, 1272.64] - - [1024, 8, 1, 500000] - - [561, 2013.23] + - [623, 2013.23] - - [4, 2368, 1, 128] - - [715, 49.9526] + - [777, 49.9526] - - [2368, 2368, 1, 128] - - [666, 4483.8] + - [728, 4483.8] - - [64, 5888, 1, 128] - - [583, 1957.67] + - [645, 1957.67] - - [5888, 4, 1, 3328] - - [716, 638.798] + - [778, 638.798] - - [6784, 1408, 1, 128] - - [661, 4715.61] + - [723, 4715.61] - - [1408, 5056, 1, 256] - - [693, 8557.67] + - [755, 8557.67] - - [512, 50176, 1, 128] - - [724, 8809.39] + - [786, 8809.39] - - [5056, 128, 1, 3328] - - [616, 6810.66] + - [678, 6810.66] - - [128, 128, 1, 1280] - - [650, 1899.69] + - [712, 1899.69] - - [512, 2, 1, 512] - - [570, 87.4813] + - [632, 87.4813] - - [448, 704, 1, 256] - - [689, 3765.97] + - [751, 3765.97] - - [4288, 3584, 1, 128] - - [674, 4563.77] + - [736, 4563.77] - - [2944, 128, 1, 3328] - - [616, 6507.45] + - [678, 6507.45] - - [128, 5056, 1, 1280] - - [679, 6557.85] + - [741, 6557.85] - - [3584, 5056, 1, 1280] - - [676, 9407.93] + - [738, 9407.93] - - [256, 448, 1, 1280] - - [640, 4096.1] + - [702, 4096.1] - - [704, 704, 1, 128] - - [666, 2374.31] + - [728, 2374.31] - - [5056, 4, 1, 128] - - [714, 125.52] + - [776, 125.52] - - [704, 256, 1, 1280] - - [689, 4016.23] + - [751, 4016.23] - - [64, 2368, 1, 3328] - - [646, 5159.29] + - [708, 5159.29] - - [1856, 1024, 1, 128] - - [666, 3356.47] + - [728, 3356.47] - - [1856, 64, 1, 128] - - [583, 945.644] + - [645, 945.644] - - [4096, 64, 1, 4096] - - [649, 6260.24] + - [711, 6260.24] - - [1024, 24000, 1, 1536] - - [693, 9368.5] + - [755, 9368.5] - - [704, 4288, 1, 256] - - [690, 7329.39] + - [752, 7329.39] - - [5888, 2368, 1, 1280] - - [679, 8624.71] + - [741, 8624.71] - - [6784, 1856, 1, 3328] - - [683, 9012.45] + - [745, 9012.45] - - [64, 128, 1, 256] - - [609, 374.591] + - [671, 374.591] - - [2368, 5888, 1, 1280] - - [677, 9045.76] + - [739, 9045.76] - - [5888, 256, 1, 1280] - - [694, 7999.17] + - [756, 7999.17] - - [4, 5888, 1, 1280] - - [627, 615.839] + - [689, 615.839] - - [704, 128, 1, 128] - - [583, 693.269] + - [645, 693.269] - - [1024, 4, 1, 1280] - - [722, 372.464] + - [784, 372.464] - - [2368, 1856, 1, 3328] - - [694, 8246.91] + - [756, 8246.91] - - [2368, 128, 1, 128] - - [584, 1963.53] + - [646, 1963.53] - - [2944, 704, 1, 256] - - [694, 7116.24] + - [756, 7116.24] - - [5056, 128, 1, 128] - - [587, 2519.49] + - [649, 2519.49] - - [2368, 1024, 1, 3328] - - [679, 7959.13] + - [741, 7959.13] - - [35, 700, 1, 2048] - - [575, 1766.86] + - [637, 1766.86] - - [256, 704, 1, 3328] - - [679, 4296.56] + - [741, 4296.56] - - [704, 3584, 1, 256] - - [678, 7441.61] + - [740, 7441.61] - - [704, 2944, 1, 3328] - - [695, 7195.81] + - [757, 7195.81] - - [6784, 1024, 1, 128] - - [666, 4509.18] + - [728, 4509.18] - - [256, 448, 1, 128] - - [591, 838.003] + - [653, 838.003] - - [448, 1024, 1, 3328] - - [689, 6515.65] + - [751, 6515.65] - - [2944, 1024, 1, 3328] - - [684, 8751.63] + - [746, 8751.63] - - [2944, 5056, 1, 128] - - [661, 4799.73] + - [723, 4799.73] - - [2368, 256, 1, 256] - - [678, 4754.67] + - [740, 4754.67] - - [1408, 6784, 1, 256] - - [706, 7477.09] + - [768, 7477.09] - - [6784, 1408, 1, 3328] - - [684, 8968.57] + - [746, 8968.57] - - [4288, 6784, 1, 128] - - [659, 4455.74] + - [721, 4455.74] - - [1408, 2944, 1, 128] - - [671, 3862.79] + - [733, 3862.79] - - [704, 64, 1, 256] - - [610, 1441.89] + - [672, 1441.89] - - [3072, 4, 1, 1024] - - [628, 711.803] + - [690, 711.803] - - [256, 2368, 1, 3328] - - [703, 5199.73] + - [765, 5199.73] - - [6784, 2944, 1, 1280] - - [687, 8914.45] + - [749, 8914.45] - - [4288, 1856, 1, 128] - - [667, 4683.3] + - [729, 4683.3] - - [1856, 2944, 1, 128] - - [661, 4589.34] + - [723, 4589.34] - - [6784, 448, 1, 128] - - [661, 3918.53] + - [723, 3918.53] - - [64, 3584, 1, 128] - - [592, 1468.11] + - [654, 1468.11] - - [448, 5056, 1, 1280] - - [684, 7561.4] + - [746, 7561.4] - - [4288, 5056, 1, 1280] - - [676, 9304.11] + - [738, 9304.11] - - [2368, 1856, 1, 128] - - [666, 4322.17] + - [728, 4322.17] - - [128, 448, 1, 1280] - - [646, 3336.48] + - [708, 3336.48] - - [4288, 704, 1, 256] - - [689, 7834.65] + - [751, 7834.65] - - [256, 3584, 1, 128] - - [662, 2500.96] + - [724, 2500.96] - - [5888, 704, 1, 256] - - [708, 7244.49] + - [770, 7244.49] - - [3584, 1024, 1, 128] - - [673, 3169.03] + - [735, 3169.03] - - [256, 5888, 1, 3328] - - [694, 7763.47] + - [756, 7763.47] - - [1408, 4288, 1, 3328] - - [676, 9273.8] + - [738, 9273.8] - - [6784, 4288, 1, 256] - - [684, 8825.2] + - [746, 8825.2] - - [4288, 256, 1, 128] - - [663, 2621.54] + - [725, 2621.54] - - [448, 1856, 1, 3328] - - [704, 5859.8] + - [766, 5859.8] - - [5888, 256, 1, 256] - - [694, 7124.84] + - [756, 7124.84] - - [1024, 4, 1, 500000] - - [559, 1030.2] + - [621, 1030.2] - - [6784, 1024, 1, 1280] - - [676, 9083.11] + - [738, 9083.11] - - [5888, 1024, 1, 128] - - [663, 4297.16] + - [725, 4297.16] - - [1024, 128, 1, 256] - - [679, 2086.82] + - [741, 2086.82] - - [512, 16, 1, 500000] - - [560, 3921.96] + - [622, 3921.96] - - [128, 64, 1, 3328] - - [650, 1969.97] + - [712, 1969.97] - - [448, 64, 1, 256] - - [635, 1092.37] + - [697, 1092.37] - - [2368, 256, 1, 128] - - [666, 2174.84] + - [728, 2174.84] - - [6784, 3584, 1, 1280] - - [676, 9558.82] + - [738, 9558.82] - - [1024, 6784, 1, 1280] - - [685, 8637.72] + - [747, 8637.72] - - [2944, 64, 1, 1280] - - [607, 4770.13] + - [669, 4770.13] - - [1408, 2944, 1, 1280] - - [676, 9238.47] + - [738, 9238.47] - - [256, 1856, 1, 256] - - [702, 4498.43] + - [764, 4498.43] - - [1408, 2368, 1, 3328] - - [684, 8344.97] + - [746, 8344.97] - - [2944, 4, 1, 3328] - - [719, 661.209] + - [781, 661.209] - - [128, 1408, 1, 3328] - - [647, 5641.42] + - [709, 5641.42] - - [2944, 1856, 1, 128] - - [661, 4488.04] + - [723, 4488.04] - - [256, 2944, 1, 128] - - [671, 2233.18] + - [733, 2233.18] - - [256, 6784, 1, 128] - - [660, 3139.9] + - [722, 3139.9] - - [2368, 4, 1, 128] - - [715, 38.7612] + - [777, 38.7612] - - [1408, 256, 1, 3328] - - [711, 4927.67] + - [773, 4927.67] - - [1856, 4, 1, 128] - - [715, 42.3719] + - [777, 42.3719] - - [1024, 16, 1, 512] - - [627, 1115.61] + - [689, 1115.61] - - [5056, 6784, 1, 128] - - [662, 4963.45] + - [724, 4963.45] - - [4288, 5056, 1, 128] - - [660, 4928.09] + - [722, 4928.09] - - [1856, 5888, 1, 128] - - [667, 4865.15] + - [729, 4865.15] - - [7680, 2, 1, 2560] - - [603, 499.612] + - [665, 499.612] - - [3584, 1856, 1, 256] - - [693, 7978.38] + - [755, 7978.38] - - [4288, 3584, 1, 1280] - - [693, 7852.26] + - [755, 7852.26] - - [2368, 448, 1, 256] - - [708, 5238.93] + - [770, 5238.93] - - [4288, 256, 1, 3328] - - [679, 6751.34] + - [741, 6751.34] - - [1856, 704, 1, 128] - - [661, 3525.56] + - [723, 3525.56] - - [1408, 64, 1, 256] - - [620, 1884.8] + - [682, 1884.8] - - [64, 1856, 1, 128] - - [597, 888.205] + - [659, 888.205] - - [4, 256, 1, 128] - - [714, 7.38178] + - [776, 7.38178] - - [512, 16, 1, 512] - - [627, 663.756] + - [689, 663.756] - - [704, 5888, 1, 128] - - [661, 4424.55] + - [723, 4424.55] - - [6784, 3584, 1, 128] - - [663, 3823.4] + - [725, 3823.4] - - [1024, 64, 1, 256] - - [605, 1379.81] + - [667, 1379.81] - - [64, 2368, 1, 256] - - [679, 2424.93] + - [741, 2424.93] - - [5124, 1500, 1, 2048] - - [697, 8391.84] + - [759, 8391.84] - - [4288, 5056, 1, 3328] - - [683, 9274.14] + - [745, 9274.14] - - [4, 1856, 1, 1280] - - [627, 453.474] + - [689, 453.474] - - [4288, 128, 1, 128] - - [661, 2157.8] + - [723, 2157.8] - - [512, 2, 1, 500000] - - [571, 516.895] + - [633, 516.895] - - [1408, 1408, 1, 128] - - [662, 3600.49] + - [724, 3600.49] - - [7680, 16, 1, 2560] - - [642, 3542.59] + - [704, 3542.59] - - [1856, 128, 1, 128] - - [594, 1532.8] + - [656, 1532.8] - - [5056, 2368, 1, 256] - - [706, 7684.07] + - [768, 7684.07] - - [4288, 704, 1, 3328] - - [679, 7642.96] + - [741, 7642.96] - - [448, 3584, 1, 256] - - [689, 6734.07] + - [751, 6734.07] - - [2368, 64, 1, 1280] - - [640, 3962.24] + - [702, 3962.24] - - [2368, 1024, 1, 1280] - - [691, 7989.64] + - [753, 7989.64] - - [2944, 1408, 1, 3328] - - [694, 8954.66] + - [756, 8954.66] - - [6144, 1500, 1, 2560] - - [712, 8170.07] + - [774, 8170.07] - - [4224, 1, 1, 128] - - [643, 76.9] + - [705, 76.9] - - [1024, 1408, 1, 3328] - - [709, 6961.38] + - [771, 6961.38] - - [2944, 5888, 1, 1280] - - [690, 8797.53] + - [752, 8797.53] - - [8448, 2, 1, 2816] - - [565, 496.958] + - [627, 496.958] - - [1408, 4, 1, 1280] - - [720, 471.891] + - [782, 471.891] - - [5888, 3584, 1, 256] - - [697, 8246.3] + - [759, 8246.3] - - [2368, 5056, 1, 128] - - [660, 4906.9] + - [722, 4906.9] - - [1408, 1856, 1, 3328] - - [684, 9006.8] + - [746, 9006.8] - - [4, 4, 1, 3328] - - [632, 5.83793] + - [694, 5.83793] - - [5888, 5056, 1, 3328] - - [697, 8545.1] + - [759, 8545.1] - - [7680, 6000, 1, 2560] - - [690, 7996.0] + - [752, 7996.0] - - [6784, 1408, 1, 1280] - - [684, 8888.13] + - [746, 8888.13] - - [4, 1024, 1, 1280] - - [632, 302.109] + - [694, 302.109] - - [512, 3000, 1, 2560] - - [684, 7809.43] + - [746, 7809.43] - - [704, 2944, 1, 256] - - [689, 4909.24] + - [751, 4909.24] - - [4288, 64, 1, 256] - - [689, 3264.72] + - [751, 3264.72] - - [6784, 5888, 1, 3328] - - [697, 9544.52] + - [759, 9544.52] - - [2368, 4288, 1, 128] - - [660, 4873.03] + - [722, 4873.03] - - [64, 4288, 1, 1280] - - [646, 4656.42] + - [708, 4656.42] - - [6784, 64, 1, 1280] - - [679, 6230.43] + - [741, 6230.43] - - [3584, 128, 1, 128] - - [587, 2315.57] + - [649, 2315.57] - - [1024, 6784, 1, 128] - - [661, 3758.94] + - [723, 3758.94] - - [1024, 1500, 1, 1536] - - [710, 6972.0] + - [772, 6972.0] - - [1408, 64, 1, 3328] - - [613, 5079.58] + - [675, 5079.58] - - [6784, 4, 1, 256] - - [599, 487.938] + - [661, 487.938] - - [1408, 1408, 1, 1280] - - [712, 7423.31] + - [774, 7423.31] - - [256, 2368, 1, 256] - - [679, 4986.9] + - [741, 4986.9] - - [3072, 3000, 1, 1024] - - [681, 7844.01] + - [743, 7844.01] - - [448, 4288, 1, 3328] - - [680, 7204.79] + - [742, 7204.79] - - [2368, 1408, 1, 256] - - [712, 5897.96] + - [774, 5897.96] - - [704, 2368, 1, 256] - - [679, 7000.93] + - [741, 7000.93] - - [1024, 24000, 1, 2560] - - [706, 8562.31] + - [768, 8562.31] - - [2944, 448, 1, 1280] - - [694, 7155.93] + - [756, 7155.93] - - [5888, 2368, 1, 3328] - - [693, 9252.42] + - [755, 9252.42] - - [1024, 256, 1, 128] - - [675, 1255.88] + - [737, 1255.88] - - [5124, 9124, 1, 1760] - - [687, 9168.49] + - [749, 9168.49] - - [448, 1408, 1, 1280] - - [679, 6150.34] + - [741, 6150.34] - - [448, 1856, 1, 1280] - - [694, 6489.76] + - [756, 6489.76] - - [4288, 448, 1, 1280] - - [709, 6887.02] + - [771, 6887.02] - - [5888, 704, 1, 3328] - - [689, 8230.64] + - [751, 8230.64] - - [4, 1856, 1, 128] - - [715, 27.0964] + - [777, 27.0964] - - [5056, 256, 1, 128] - - [660, 3469.01] + - [722, 3469.01] - - [1856, 256, 1, 128] - - [661, 2534.16] + - [723, 2534.16] - - [128, 2368, 1, 256] - - [679, 3660.22] + - [741, 3660.22] - - [704, 4, 1, 256] - - [627, 134.596] + - [689, 134.596] - - [1024, 6784, 1, 3328] - - [681, 8482.75] + - [743, 8482.75] - - [1408, 5888, 1, 128] - - [661, 4644.52] + - [723, 4644.52] - - [4288, 4, 1, 128] - - [714, 35.8799] + - [776, 35.8799] - - [512, 3136, 1, 2048] - - [726, 6386.69] + - [788, 6386.69] - - [1408, 1024, 1, 256] - - [679, 5440.82] + - [741, 5440.82] - - [128, 64, 1, 256] - - [609, 380.019] + - [671, 380.019] - - [8448, 1500, 1, 2816] - - [676, 9155.92] + - [738, 9155.92] - - [256, 704, 1, 128] - - [661, 895.623] + - [723, 895.623] - - [2560, 7000, 1, 2560] - - [688, 8565.66] + - [750, 8565.66] - - [5888, 64, 1, 1280] - - [703, 5007.83] + - [765, 5007.83] - - [128, 4, 1, 3328] - - [722, 165.21] + - [784, 165.21] - - [5056, 6784, 1, 1280] - - [687, 9331.48] + - [749, 9331.48] - - [1024, 448, 1, 1280] - - [689, 6501.46] + - [751, 6501.46] - - [704, 5056, 1, 3328] - - [676, 8090.13] + - [738, 8090.13] - - [128, 5056, 1, 256] - - [689, 5537.37] + - [751, 5537.37] - - [3584, 5056, 1, 3328] - - [685, 8633.24] + - [747, 8633.24] - - [1856, 4, 1, 3328] - - [723, 582.814] + - [785, 582.814] - - [4, 2944, 1, 128] - - [714, 114.292] + - [776, 114.292] - - [2368, 2944, 1, 3328] - - [693, 8749.55] + - [755, 8749.55] - - [448, 448, 1, 1280] - - [617, 4694.93] + - [679, 4694.93] - - [128, 4, 1, 128] - - [714, 4.94734] + - [776, 4.94734] - - [2368, 3584, 1, 256] - - [693, 8418.59] + - [755, 8418.59] - - [4608, 3000, 1, 1536] - - [683, 9076.47] + - [745, 9076.47] - - [1024, 256, 1, 1280] - - [689, 5562.84] + - [751, 5562.84] - - [5056, 3584, 1, 1280] - - [683, 8365.09] + - [745, 8365.09] - - [5124, 9124, 1, 4096] - - [693, 8648.58] + - [755, 8648.58] - - [7680, 48000, 1, 2560] - - [687, 4098.26] + - [749, 4098.26] - - [1856, 704, 1, 1280] - - [679, 8141.04] + - [741, 8141.04] - - [1856, 2944, 1, 1280] - - [681, 8214.4] + - [743, 8214.4] - - [4608, 1500, 1, 1536] - - [689, 8424.53] + - [751, 8424.53] - - [1024, 48000, 1, 2816] - - [680, 8513.18] + - [742, 8513.18] - - [5124, 9124, 1, 2560] - - [697, 8641.24] + - [759, 8641.24] - - [128, 1024, 1, 256] - - [611, 2356.45] + - [673, 2356.45] - - [2944, 1408, 1, 256] - - [693, 8254.29] + - [755, 8254.29] - - [4288, 1408, 1, 3328] - - [687, 9138.49] + - [749, 9138.49] - - [3584, 64, 1, 3328] - - [600, 5629.62] + - [662, 5629.62] - - [5888, 2944, 1, 128] - - [661, 4119.33] + - [723, 4119.33] - - [2944, 1024, 1, 128] - - [663, 4002.96] + - [725, 4002.96] - - [128, 1, 1, 1024] - - [657, 20.0805] + - [719, 20.0805] - - [5124, 700, 1, 2048] - - [694, 7653.84] + - [756, 7653.84] - - [4, 4288, 1, 1280] - - [627, 587.749] + - [689, 587.749] - - [6784, 5056, 1, 128] - - [666, 4855.85] + - [728, 4855.85] - - [256, 1024, 1, 3328] - - [689, 6116.28] + - [751, 6116.28] - - [3584, 4, 1, 256] - - [601, 395.576] + - [663, 395.576] - - [1856, 64, 1, 3328] - - [616, 5732.6] + - [678, 5732.6] - - [4, 128, 1, 3328] - - [722, 162.689] + - [784, 162.689] - - [256, 12544, 1, 1024] - - [726, 7628.92] + - [788, 7628.92] - - [5888, 1408, 1, 3328] - - [687, 9524.43] + - [749, 9524.43] - - [448, 2944, 1, 128] - - [661, 3163.91] + - [723, 3163.91] - - [2368, 1856, 1, 256] - - [689, 8167.36] + - [751, 8167.36] - - [256, 5056, 1, 256] - - [679, 7292.13] + - [741, 7292.13] - - [5056, 5056, 1, 128] - - [667, 5043.99] + - [729, 5043.99] - - [448, 3584, 1, 3328] - - [684, 6839.56] + - [746, 6839.56] - - [4, 5056, 1, 3328] - - [632, 639.886] + - [694, 639.886] - - [256, 256, 1, 128] - - [591, 554.902] + - [653, 554.902] - - [5888, 256, 1, 128] - - [663, 3562.47] + - [725, 3562.47] - - [4, 5056, 1, 128] - - [714, 149.907] + - [776, 149.907] - - [448, 256, 1, 256] - - [610, 2121.5] + - [672, 2121.5] - - [704, 4, 1, 3328] - - [720, 455.919] + - [782, 455.919] - - [1408, 256, 1, 256] - - [679, 4352.68] + - [741, 4352.68] - - [3584, 1856, 1, 128] - - [670, 3933.23] + - [732, 3933.23] - - [4288, 4288, 1, 128] - - [661, 4888.61] + - [723, 4888.61] - - [1856, 1024, 1, 3328] - - [697, 8242.64] + - [759, 8242.64] - - [1856, 4288, 1, 128] - - [666, 4647.4] + - [728, 4647.4] - - [1024, 6000, 1, 2560] - - [691, 8526.75] + - [753, 8526.75] - - [1024, 5056, 1, 256] - - [676, 7343.83] + - [738, 7343.83] - - [5056, 5888, 1, 128] - - [665, 4053.5] + - [727, 4053.5] - - [2368, 1408, 1, 3328] - - [679, 8466.2] + - [741, 8466.2] - - [1024, 48000, 1, 1536] - - [697, 9487.74] + - [759, 9487.74] - - [5888, 448, 1, 256] - - [710, 6081.54] + - [772, 6081.54] - - [5888, 6784, 1, 128] - - [662, 4820.27] + - [724, 4820.27] - - [2368, 4, 1, 3328] - - [721, 620.628] + - [783, 620.628] - - [6784, 5056, 1, 1280] - - [706, 8525.5] + - [768, 8525.5] - - [5056, 704, 1, 1280] - - [676, 7933.06] + - [738, 7933.06] - - [1024, 48000, 1, 2560] - - [697, 8877.94] + - [759, 8877.94] - - [4608, 32, 1, 1536] - - [626, 3556.83] + - [688, 3556.83] - - [1024, 2368, 1, 128] - - [669, 2943.75] + - [731, 2943.75] - - [128, 704, 1, 256] - - [610, 2059.8] + - [672, 2059.8] - - [2368, 448, 1, 3328] - - [689, 5290.42] + - [751, 5290.42] - - [128, 5888, 1, 3328] - - [689, 7764.43] + - [751, 7764.43] - - [448, 128, 1, 1280] - - [640, 3373.28] + - [702, 3373.28] - - [6784, 4, 1, 3328] - - [599, 676.063] + - [661, 676.063] - - [4288, 4, 1, 1280] - - [632, 564.775] + - [694, 564.775] - - [1024, 64, 1, 3328] - - [646, 4293.48] + - [708, 4293.48] - - [3072, 48000, 1, 1024] - - [696, 7826.51] + - [758, 7826.51] - - [256, 4, 1, 128] - - [715, 4.93304] + - [777, 4.93304] - - [1024, 5888, 1, 128] - - [674, 3610.46] + - [736, 3610.46] - - [3584, 5888, 1, 128] - - [662, 4722.35] + - [724, 4722.35] - - [5056, 5888, 1, 256] - - [697, 9159.11] + - [759, 9159.11] - - [2368, 1024, 1, 256] - - [689, 7482.71] + - [751, 7482.71] - - [2944, 1856, 1, 256] - - [693, 8209.0] + - [755, 8209.0] - - [1856, 6784, 1, 1280] - - [689, 8205.43] + - [751, 8205.43] - - [64, 5056, 1, 128] - - [584, 2079.35] + - [646, 2079.35] - - [64, 6784, 1, 128] - - [584, 2437.58] + - [646, 2437.58] - - [448, 704, 1, 128] - - [660, 1506.45] + - [722, 1506.45] - - [4, 1024, 1, 128] - - [715, 17.3463] + - [777, 17.3463] - - [1408, 448, 1, 256] - - [679, 5545.45] + - [741, 5545.45] - - [1408, 704, 1, 128] - - [665, 2931.65] + - [727, 2931.65] - - [64, 256, 1, 3328] - - [651, 2816.52] + - [713, 2816.52] - - [8448, 3000, 1, 2816] - - [685, 8872.99] + - [747, 8872.99] - - [6784, 448, 1, 3328] - - [679, 7555.48] + - [741, 7555.48] - - [5056, 1856, 1, 1280] - - [677, 8652.36] + - [739, 8652.36] - - [1408, 1024, 1, 3328] - - [681, 7781.42] + - [743, 7781.42] - - [2368, 256, 1, 3328] - - [685, 5392.06] + - [747, 5392.06] - - [7680, 1500, 1, 2560] - - [683, 8919.72] + - [745, 8919.72] - - [5888, 3584, 1, 1280] - - [683, 9235.85] + - [745, 9235.85] - - [1856, 3584, 1, 3328] - - [694, 8348.83] + - [756, 8348.83] - - [5888, 128, 1, 1280] - - [679, 5928.61] + - [741, 5928.61] - - [1024, 2944, 1, 256] - - [710, 6630.27] + - [772, 6630.27] - - [448, 6784, 1, 1280] - - [691, 8332.45] + - [753, 8332.45] - - [256, 3584, 1, 1280] - - [681, 7140.19] + - [743, 7140.19] - - [448, 128, 1, 128] - - [583, 552.813] + - [645, 552.813] - - [704, 5056, 1, 256] - - [689, 7959.68] + - [751, 7959.68] - - [3584, 1024, 1, 3328] - - [681, 8386.84] + - [743, 8386.84] - - [2944, 1856, 1, 1280] - - [697, 7670.29] + - [759, 7670.29] - - [128, 256, 1, 128] - - [598, 258.37] + - [660, 258.37] - - [5056, 256, 1, 256] - - [689, 5736.77] + - [751, 5736.77] - - [2944, 4288, 1, 3328] - - [676, 8730.8] + - [738, 8730.8] - - [2368, 3584, 1, 3328] - - [678, 8437.71] + - [740, 8437.71] - - [2944, 704, 1, 1280] - - [689, 8342.53] + - [751, 8342.53] - - [128, 4, 1, 256] - - [609, 24.9242] + - [671, 24.9242] - - [2944, 3584, 1, 1280] - - [691, 8322.11] + - [753, 8322.11] - - [1856, 5888, 1, 1280] - - [676, 8911.91] + - [738, 8911.91] - - [256, 256, 1, 1280] - - [640, 3653.67] + - [702, 3653.67] - - [4608, 24000, 1, 1536] - - [690, 8931.06] + - [752, 8931.06] - - [4288, 1408, 1, 256] - - [677, 8338.45] + - [739, 8338.45] - - [3584, 64, 1, 256] - - [689, 3414.07] + - [751, 3414.07] - - [64, 1856, 1, 3328] - - [616, 5460.23] + - [678, 5460.23] - - [256, 1408, 1, 128] - - [660, 1424.09] + - [722, 1424.09] - - [5888, 1408, 1, 128] - - [671, 4177.88] + - [733, 4177.88] - - [4288, 2368, 1, 1280] - - [680, 8596.05] + - [742, 8596.05] - - [4, 4288, 1, 256] - - [716, 370.954] + - [778, 370.954] - - [256, 4288, 1, 128] - - [661, 2907.99] + - [723, 2907.99] - - [256, 128, 1, 3328] - - [654, 3644.88] + - [716, 3644.88] - - [512, 8, 1, 500000] - - [566, 2025.89] + - [628, 2025.89] - - [6784, 2368, 1, 256] - - [679, 8470.41] + - [741, 8470.41] - - [5888, 128, 1, 128] - - [584, 2604.55] + - [646, 2604.55] - - [1408, 448, 1, 3328] - - [689, 6540.62] + - [751, 6540.62] - - [1024, 24000, 1, 2816] - - [706, 8364.03] + - [768, 8364.03] - - [704, 1024, 1, 1280] - - [689, 7277.28] + - [751, 7277.28] - - [1856, 256, 1, 3328] - - [679, 7039.14] + - [741, 7039.14] - - [1856, 2944, 1, 256] - - [688, 8151.59] + - [750, 8151.59] - - [5056, 1024, 1, 128] - - [662, 4422.82] + - [724, 4422.82] - - [64, 5888, 1, 1280] - - [640, 4854.62] + - [702, 4854.62] - - [7680, 3000, 1, 2560] - - [693, 8789.57] + - [755, 8789.57] - - [4224, 1500, 1, 176] - - [689, 7902.14] + - [751, 7902.14] - - [5124, 700, 1, 2560] - - [679, 8232.59] + - [741, 8232.59] - - [6784, 256, 1, 128] - - [660, 3548.92] + - [722, 3548.92] - - [5888, 704, 1, 128] - - [667, 3959.65] + - [729, 3959.65] - - [6784, 64, 1, 128] - - [595, 2150.82] + - [657, 2150.82] - - [4, 448, 1, 1280] - - [720, 268.063] + - [782, 268.063] - - [1024, 4288, 1, 1280] - - [694, 8363.72] + - [756, 8363.72] - - [2368, 5056, 1, 3328] - - [693, 8581.85] + - [755, 8581.85] - - [448, 4, 1, 128] - - [714, 16.8673] + - [776, 16.8673] - - [4, 256, 1, 3328] - - [723, 201.988] + - [785, 201.988] - - [4288, 1024, 1, 3328] - - [689, 8567.72] + - [751, 8567.72] - - [6144, 48000, 1, 2560] - - [697, 3751.68] + - [759, 3751.68] - - [1024, 5056, 1, 3328] - - [676, 9440.66] + - [738, 9440.66] - - [1024, 1856, 1, 3328] - - [697, 8244.36] + - [759, 8244.36] - - [704, 704, 1, 1280] - - [689, 5529.99] + - [751, 5529.99] - - [128, 2368, 1, 1280] - - [646, 5062.38] + - [708, 5062.38] - - [3584, 4, 1, 128] - - [715, 61.5949] + - [777, 61.5949] - - [3584, 256, 1, 1280] - - [713, 6260.24] + - [775, 6260.24] - - [4, 128, 1, 128] - - [714, 1.2587] + - [776, 1.2587] - - [128, 4288, 1, 3328] - - [625, 6186.15] + - [687, 6186.15] - - [5124, 1500, 1, 2560] - - [693, 8432.62] + - [755, 8432.62] - - [3584, 128, 1, 1280] - - [679, 6547.85] + - [741, 6547.85] - - [4, 256, 1, 1280] - - [632, 180.144] + - [694, 180.144] - - [128, 704, 1, 3328] - - [604, 5177.81] + - [666, 5177.81] - - [4288, 6784, 1, 256] - - [677, 9005.34] + - [739, 9005.34] - - [3584, 2944, 1, 3328] - - [694, 8872.27] + - [756, 8872.27] - - [128, 1856, 1, 256] - - [679, 3690.48] + - [741, 3690.48] - - [64, 4288, 1, 256] - - [679, 3007.57] + - [741, 3007.57] - - [4, 3584, 1, 3328] - - [609, 639.99] + - [671, 639.99] - - [64, 4, 1, 3328] - - [723, 98.7074] + - [785, 98.7074] - - [4, 64, 1, 3328] - - [723, 91.9069] + - [785, 91.9069] - - [35, 700, 1, 2560] - - [577, 2397.65] + - [639, 2397.65] - - [5888, 2944, 1, 256] - - [687, 9031.28] + - [749, 9031.28] - - [4, 2368, 1, 256] - - [627, 256.968] + - [689, 256.968] - - [1856, 64, 1, 256] - - [611, 2222.96] + - [673, 2222.96] - - [5056, 128, 1, 1280] - - [679, 6557.85] + - [741, 6557.85] - - [448, 4288, 1, 1280] - - [703, 6891.66] + - [765, 6891.66] - - [256, 4288, 1, 256] - - [679, 6250.51] + - [741, 6250.51] - - [1024, 4288, 1, 128] - - [663, 3951.41] + - [725, 3951.41] - - [4, 1024, 1, 256] - - [627, 182.144] + - [689, 182.144] - - [5056, 4288, 1, 256] - - [683, 8933.43] + - [745, 8933.43] - - [1024, 448, 1, 256] - - [689, 4573.33] + - [751, 4573.33] - - [1024, 3584, 1, 256] - - [684, 7447.18] + - [746, 7447.18] - - [2944, 128, 1, 1280] - - [689, 5417.27] + - [751, 5417.27] - - [49, 2048, 64, 512] - - [732, 5916.91] + - [794, 5916.91] - - [2560, 32, 1, 2560] - - [626, 4076.99] + - [688, 4076.99] - - [64, 256, 1, 256] - - [643, 689.953] + - [705, 689.953] - - [1024, 4, 1, 512] - - [635, 288.17] + - [697, 288.17] - - [128, 2368, 1, 128] - - [589, 1809.68] + - [651, 1809.68] - - [256, 704, 1, 1280] - - [679, 4033.08] + - [741, 4033.08] - - [64, 2368, 1, 128] - - [580, 1165.88] + - [642, 1165.88] - - [176, 1500, 1, 1408] - - [607, 4922.13] + - [669, 4922.13] - - [448, 5888, 1, 1280] - - [689, 7550.21] + - [751, 7550.21] - - [512, 3000, 1, 2048] - - [711, 6562.44] + - [773, 6562.44] - - [5056, 448, 1, 128] - - [661, 3947.97] + - [723, 3947.97] - - [4288, 704, 1, 1280] - - [679, 8243.82] + - [741, 8243.82] - - [3584, 2944, 1, 128] - - [671, 4284.88] + - [733, 4284.88] - - [6784, 256, 1, 1280] - - [679, 7955.21] + - [741, 7955.21] - - [256, 2944, 1, 1280] - - [709, 6691.9] + - [771, 6691.9] - - [2560, 128, 1, 2560] - - [647, 5347.23] + - [709, 5347.23] - - [2368, 5888, 1, 3328] - - [684, 8919.07] + - [746, 8919.07] - - [4, 64, 1, 256] - - [632, 13.1032] + - [694, 13.1032] - - [704, 1024, 1, 3328] - - [709, 6648.12] + - [771, 6648.12] - - [2368, 1856, 1, 1280] - - [695, 8016.51] + - [757, 8016.51] - - [448, 5056, 1, 3328] - - [679, 8231.73] + - [741, 8231.73] - - [128, 448, 1, 128] - - [588, 441.208] + - [650, 441.208] - - [128, 6784, 1, 256] - - [689, 5850.05] + - [751, 5850.05] - - [512, 4, 1, 500000] - - [569, 1027.14] + - [631, 1027.14] - - [3584, 4288, 1, 128] - - [665, 4260.9] + - [727, 4260.9] - - [64, 448, 1, 128] - - [588, 253.554] + - [650, 253.554] - - [1024, 6000, 1, 2816] - - [693, 8886.14] + - [755, 8886.14] - - [5888, 4288, 1, 3328] - - [693, 8968.16] + - [755, 8968.16] - - [2368, 704, 1, 256] - - [709, 4663.24] + - [771, 4663.24] - - [256, 1856, 1, 3328] - - [681, 6480.63] + - [743, 6480.63] - - [1856, 128, 1, 256] - - [679, 3726.66] + - [741, 3726.66] - - [6784, 128, 1, 128] - - [582, 2824.01] + - [644, 2824.01] - - [3584, 1408, 1, 128] - - [665, 3666.78] + - [727, 3666.78] - - [1856, 5056, 1, 1280] - - [676, 8651.36] + - [738, 8651.36] - - [2944, 1024, 1, 1280] - - [687, 8765.21] + - [749, 8765.21] - - [5056, 4, 1, 256] - - [601, 428.688] + - [663, 428.688] - - [3584, 5888, 1, 3328] - - [687, 9347.75] + - [749, 9347.75] - - [2368, 4288, 1, 256] - - [697, 8013.1] + - [759, 8013.1] - - [1024, 2368, 1, 3328] - - [684, 8119.29] + - [746, 8119.29] - - [128, 3584, 1, 128] - - [584, 2584.62] + - [646, 2584.62] - - [704, 1408, 1, 256] - - [689, 6792.27] + - [751, 6792.27] - - [4096, 128, 1, 4096] - - [711, 6624.84] + - [773, 6624.84] - - [1024, 2944, 1, 128] - - [663, 3771.37] + - [725, 3771.37] - - [1024, 3584, 1, 1280] - - [684, 8952.71] + - [746, 8952.71] - - [4288, 5888, 1, 3328] - - [697, 9048.05] + - [759, 9048.05] - - [4288, 4, 1, 3328] - - [602, 615.206] + - [664, 615.206] - - [4608, 16, 1, 1536] - - [606, 2894.94] + - [668, 2894.94] - - [5888, 64, 1, 128] - - [593, 1827.16] + - [655, 1827.16] - - [4, 5888, 1, 128] - - [714, 179.544] + - [776, 179.544] - - [1024, 2944, 1, 3328] - - [685, 8298.77] + - [747, 8298.77] - - [2048, 64, 1, 2048] - - [614, 4963.77] + - [676, 4963.77] - - [6144, 2, 1, 2560] - - [603, 477.88] + - [665, 477.88] - - [256, 6784, 1, 1280] - - [677, 7491.94] + - [739, 7491.94] - - [1856, 3584, 1, 256] - - [689, 7580.6] + - [751, 7580.6] - - [128, 448, 1, 3328] - - [640, 4417.71] + - [702, 4417.71] - - [6784, 1856, 1, 128] - - [668, 4621.74] + - [730, 4621.74] - - [1024, 1500, 1, 2048] - - [689, 6284.5] + - [751, 6284.5] - - [5056, 128, 1, 256] - - [689, 5705.16] + - [751, 5705.16] - - [512, 24000, 1, 2816] - - [676, 8919.85] + - [738, 8919.85] - - [256, 5888, 1, 1280] - - [691, 7978.0] + - [753, 7978.0] - - [4, 128, 1, 1280] - - [632, 94.2609] + - [694, 94.2609] - - [4288, 6784, 1, 3328] - - [697, 9012.58] + - [759, 9012.58] - - [6784, 128, 1, 1280] - - [681, 6807.35] + - [743, 6807.35] - - [64, 1408, 1, 256] - - [610, 2045.19] + - [672, 2045.19] - - [2368, 1408, 1, 128] - - [661, 4340.73] + - [723, 4340.73] - - [1856, 448, 1, 256] - - [710, 3639.99] + - [772, 3639.99] - - [1408, 1024, 1, 128] - - [669, 3417.68] + - [731, 3417.68] - - [128, 64, 1, 128] - - [590, 68.7241] + - [652, 68.7241] - - [6784, 3584, 1, 3328] - - [687, 9425.63] + - [749, 9425.63] - - [1760, 7000, 1, 1760] - - [684, 8780.41] + - [746, 8780.41] - - [1024, 704, 1, 3328] - - [701, 5644.6] + - [763, 5644.6] - - [64, 64, 1, 128] - - [580, 38.2023] + - [642, 38.2023] - - [2368, 5056, 1, 1280] - - [698, 8462.41] + - [760, 8462.41] - - [64, 4, 1, 1280] - - [632, 46.6455] + - [694, 46.6455] - - [1408, 2368, 1, 1280] - - [684, 8235.08] + - [746, 8235.08] - - [128, 1408, 1, 1280] - - [646, 4491.66] + - [708, 4491.66] - - [1024, 1, 1, 512] - - [650, 82.02] + - [712, 82.02] - - [4, 1408, 1, 128] - - [714, 56.42] + - [776, 56.42] - - [704, 4288, 1, 128] - - [668, 3942.96] + - [730, 3942.96] - - [128, 1856, 1, 3328] - - [634, 6111.93] + - [696, 6111.93] - - [2944, 2944, 1, 256] - - [693, 8640.22] + - [755, 8640.22] - - [2944, 4, 1, 1280] - - [627, 554.265] + - [689, 554.265] - - [5888, 4, 1, 256] - - [609, 435.744] + - [671, 435.744] - - [6784, 256, 1, 256] - - [689, 7025.96] + - [751, 7025.96] - - [256, 5056, 1, 3328] - - [689, 8249.57] + - [751, 8249.57] - - [128, 4288, 1, 1280] - - [679, 5561.74] + - [741, 5561.74] - - [5056, 1856, 1, 128] - - [673, 3975.28] + - [735, 3975.28] - - [1024, 3000, 1, 1536] - - [694, 8544.54] + - [756, 8544.54] - - [5056, 1024, 1, 3328] - - [687, 9361.47] + - [749, 9361.47] - - [128, 128, 1, 256] - - [639, 699.151] + - [701, 699.151] - - [1760, 64, 1, 1760] - - [607, 4956.26] + - [669, 4956.26] - - [4288, 3584, 1, 3328] - - [707, 7506.18] + - [769, 7506.18] - - [448, 704, 1, 3328] - - [679, 4697.66] + - [741, 4697.66] - - [448, 448, 1, 128] - - [596, 1249.62] + - [658, 1249.62] - - [1024, 2368, 1, 1280] - - [689, 7756.44] + - [751, 7756.44] - - [1856, 704, 1, 3328] - - [689, 8340.66] + - [751, 8340.66] - - [512, 1500, 1, 2560] - - [691, 6041.39] + - [753, 6041.39] - - [5888, 6784, 1, 3328] - - [687, 9199.38] + - [749, 9199.38] - - [704, 4288, 1, 1280] - - [681, 8342.06] + - [743, 8342.06] - - [128, 50176, 1, 512] - - [727, 7589.48] + - [789, 7589.48] - - [704, 256, 1, 256] - - [679, 2912.81] + - [741, 2912.81] - - [1024, 48000, 1, 2048] - - [684, 8947.42] + - [746, 8947.42] - - [4288, 1024, 1, 128] - - [660, 4291.75] + - [722, 4291.75] - - [3136, 64, 128, 64] - - [742, 8175.16] + - [804, 8175.16] - - [784, 512, 64, 128] - - [740, 8378.44] + - [802, 8378.44] - - [3136, 256, 64, 64] - - [743, 8506.75] + - [805, 8506.75] - - [12544, 1024, 1, 256] - - [736, 8928.03] + - [798, 8928.03] - - [784, 128, 128, 512] - - [741, 8190.63] + - [803, 8190.63] - - [784, 512, 256, 128] - - [739, 8637.24] + - [801, 8637.24] - - [3136, 64, 64, 256] - - [738, 8783.03] + - [800, 8783.03] - - [3136, 512, 1, 2048] - - [735, 7298.42] + - [797, 7298.42] - - [12544, 256, 1, 1024] - - [747, 7667.35] + - [809, 7667.35] - - [3136, 2048, 1, 512] - - [746, 8447.32] + - [808, 8447.32] - - [3136, 256, 256, 64] - - [739, 8663.18] + - [801, 8663.18] - - [3136, 64, 128, 256] - - [737, 8943.56] + - [799, 8943.56] - - [784, 128, 64, 512] - - [745, 8006.37] + - [807, 8006.37] - - [3136, 64, 256, 64] - - [742, 8267.22] + - [804, 8267.22] - - [784, 512, 128, 128] - - [739, 8564.35] + - [801, 8564.35] - - [3136, 64, 64, 64] - - [742, 8009.45] + - [804, 8009.45] - - [784, 128, 256, 512] - - [743, 8377.16] + - [805, 8377.16] - - [3136, 64, 256, 256] - - [744, 9033.98] + - [806, 9033.98] - - [3136, 256, 128, 64] - - [739, 8624.56] + - [801, 8624.56] - - [1024, 256, 1, 1024] - - [765, 6331.13] + - [827, 6331.13] - - [1024, 512, 1, 2048] - - [764, 8100.14] + - [826, 8100.14] - - [512, 200, 1, 512] - - [773, 2861.93] + - [835, 2861.93] - - [4096, 256, 1, 2048] - - [756, 8812.82] + - [818, 8812.82] - - [4096, 512, 1, 1024] - - [766, 9068.87] + - [828, 9068.87] - - [1024, 200, 1, 1024] - - [765, 5110.12] + - [827, 5110.12] - - [1024, 512, 1, 1024] - - [758, 7785.35] + - [820, 7785.35] - - [2048, 256, 1, 4096] - - [768, 8438.81] + - [830, 8438.81] - - [2048, 768, 1, 512] - - [750, 8618.53] + - [812, 8618.53] - - [512, 256, 1, 1024] - - [770, 4835.03] + - [832, 4835.03] - - [512, 768, 1, 2048] - - [767, 6909.04] + - [829, 6909.04] - - [2048, 256, 1, 1024] - - [763, 7941.98] + - [825, 7941.98] - - [1024, 256, 1, 2048] - - [760, 6997.9] + - [822, 6997.9] - - [2048, 200, 1, 512] - - [763, 5649.76] + - [825, 5649.76] - - [4096, 200, 1, 1024] - - [761, 6678.93] + - [823, 6678.93] - - [2048, 200, 1, 4096] - - [769, 6706.69] + - [831, 6706.69] - - [2048, 512, 1, 1024] - - [766, 8549.0] + - [828, 8549.0] - - [1024, 1024, 1, 512] - - [761, 8046.73] + - [823, 8046.73] - - [1024, 200, 1, 4096] - - [760, 5884.36] + - [822, 5884.36] - - [2048, 512, 1, 4096] - - [771, 8995.94] + - [833, 8995.94] - - [4096, 512, 1, 2048] - - [766, 9298.18] - - - [512, 256, 1, 2048] - - [759, 5186.26] + - [828, 9298.18] - - [4096, 1024, 1, 2048] - - [748, 9790.77] + - [810, 9790.77] - - [2048, 1024, 1, 2048] - - [749, 9278.9] + - [811, 9278.9] - - [1024, 200, 1, 512] - - [765, 4535.46] + - [827, 4535.46] - - [1024, 1024, 1, 4096] - - [756, 8967.39] + - [818, 8967.39] - - [2048, 1024, 1, 4096] - - [751, 9500.56] + - [813, 9500.56] - - [4096, 200, 1, 2048] - - [757, 7082.68] + - [819, 7082.68] - - [2048, 200, 1, 1024] - - [763, 6212.04] + - [825, 6212.04] - - [1024, 768, 1, 512] - - [764, 7401.81] + - [826, 7401.81] - - [2048, 512, 1, 512] - - [761, 8124.66] + - [823, 8124.66] - - [2048, 200, 1, 2048] - - [763, 6561.9] + - [825, 6561.9] - - [2048, 256, 1, 2048] - - [764, 8224.23] + - [826, 8224.23] - - [512, 768, 1, 512] - - [762, 6469.46] + - [824, 6469.46] - - [512, 200, 1, 1024] - - [765, 3755.74] + - [827, 3755.74] - - [4096, 1024, 1, 1024] - - [748, 9605.95] + - [810, 9605.95] - - [4096, 256, 1, 4096] - - [771, 8961.39] + - [833, 8961.39] - - [1024, 512, 1, 512] - - [764, 7109.09] + - [826, 7109.09] - - [512, 256, 1, 512] - - [772, 4033.08] + - [834, 4033.08] - - [1024, 256, 1, 4096] - - [760, 7326.4] + - [822, 7326.4] - - [4096, 512, 1, 4096] - - [752, 9472.07] + - [814, 9472.07] - - [1024, 200, 1, 2048] - - [753, 5530.56] + - [815, 5530.56] - - [2048, 1024, 1, 512] - - [754, 8995.93] + - [816, 8995.93] - - [1024, 1024, 1, 2048] - - [761, 8830.21] + - [823, 8830.21] - - [4096, 256, 1, 1024] - - [761, 8581.8] + - [823, 8581.8] - - [512, 768, 1, 1024] - - [762, 6876.01] + - [824, 6876.01] - - [1024, 512, 1, 4096] - - [758, 8484.15] + - [820, 8484.15] - - [1024, 256, 1, 512] - - [755, 5668.08] + - [817, 5668.08] - - [4096, 200, 1, 4096] - - [768, 7018.69] + - [830, 7018.69] - - [2048, 256, 1, 512] - - [768, 7079.09] + - [830, 7079.09] - - [512, 200, 1, 2048] - - [773, 4283.5] + - [835, 4283.5] - - [1024, 1024, 1, 1024] - - [756, 8565.37] + - [818, 8565.37] - - [2048, 512, 1, 2048] - - [756, 8850.59] + - [818, 8850.59] - - [4096, 1024, 1, 4096] - - [749, 9843.28] + - [811, 9843.28] - - [2048, 1024, 1, 1024] - - [754, 9234.21] + - [816, 9234.21] - - [4096, 384, 1, 2048] - - [796, 8892.62] + - [858, 8892.62] - - [4096, 192, 1, 2048] - - [790, 8024.28] + - [852, 8024.28] - - [289, 160, 64, 768] - - [792, 6783.73] + - [854, 6783.73] - - [1225, 192, 64, 384] - - [779, 9373.93] + - [841, 9373.93] - - [5329, 64, 64, 160] - - [783, 9186.79] + - [845, 9186.79] - - [1225, 64, 64, 288] - - [774, 8492.51] + - [836, 8492.51] - - [1225, 64, 64, 384] - - [778, 8735.86] + - [840, 8735.86] - - [289, 128, 64, 1024] - - [793, 7000.3] + - [855, 7000.3] - - [4096, 320, 1, 1280] - - [798, 8302.36] + - [860, 8302.36] - - [4096, 384, 1, 1536] - - [780, 9052.55] + - [842, 9052.55] - - [4096, 192, 1, 1280] - - [795, 7561.95] + - [857, 7561.95] - - [289, 192, 64, 768] - - [791, 7882.6] + - [853, 7882.6] - - [1225, 48, 64, 256] - - [782, 6620.35] + - [844, 6620.35] - - [289, 192, 64, 1024] - - [789, 7347.09] + - [851, 7347.09] - - [1225, 64, 64, 192] - - [775, 8098.45] + - [837, 8098.45] - - [1225, 96, 64, 384] - - [776, 8303.18] + - [838, 8303.18] - - [1225, 48, 64, 288] - - [784, 6746.87] + - [846, 6746.87] - - [4096, 320, 1, 2048] - - [785, 8384.52] + - [847, 8384.52] - - [4096, 256, 1, 1536] - - [797, 8734.44] + - [859, 8734.44] - - [1225, 48, 64, 192] - - [784, 6516.46] + - [846, 6516.46] - - [4096, 384, 1, 1280] - - [794, 9023.34] + - [856, 9023.34] - - [1225, 64, 64, 256] - - [781, 8319.44] + - [843, 8319.44] - - [4096, 448, 1, 1280] - - [785, 8343.42] + - [847, 8343.42] - - [289, 128, 64, 768] - - [787, 7668.08] + - [849, 7668.08] - - [289, 256, 64, 1024] - - [788, 7535.56] + - [850, 7535.56] - - [4096, 448, 1, 2048] - - [785, 8572.41] + - [847, 8572.41] - - [5329, 80, 64, 64] - - [784, 6492.54] + - [846, 6492.54] - - [1225, 32, 64, 192] - - [777, 6278.64] + - [839, 6278.64] - - [289, 384, 64, 1024] - - [786, 7767.67] + - [848, 7767.67] - - [1024, 3594, 1, 4096] - - [805, 8661.52] + - [867, 8661.52] - - [4096, 3103, 1, 1024] - - [815, 9652.23] + - [877, 9652.23] - - [4096, 3136, 1, 1024] - - [799, 9723.15] + - [861, 9723.15] - - [1024, 3141, 1, 4096] - - [817, 8612.12] + - [879, 8612.12] - - [64, 147, 432, 148] - - [832, 6372.03] + - [894, 6372.03] - - [4096, 3559, 1, 1024] - - [804, 9906.35] + - [866, 9906.35] - - [4096, 3368, 1, 1024] - - [799, 9721.01] + - [861, 9721.01] - - [1024, 3335, 1, 4096] - - [823, 8990.29] + - [885, 8990.29] - - [1024, 3510, 1, 4096] - - [823, 9440.68] + - [885, 9440.68] - - [4096, 3209, 1, 1024] - - [804, 9632.76] + - [866, 9632.76] - - [4096, 3322, 1, 1024] - - [803, 9939.52] + - [865, 9939.52] - - [1024, 3400, 1, 4096] - - [822, 9156.09] + - [884, 9156.09] - - [1024, 3995, 1, 4096] - - [805, 9610.25] + - [867, 9610.25] - - [1024, 3503, 1, 4096] - - [823, 9446.57] + - [885, 9446.57] - - [4096, 3594, 1, 1024] - - [814, 9691.96] + - [876, 9691.96] - - [4096, 3473, 1, 1024] - - [803, 9698.9] + - [865, 9698.9] - - [4096, 3522, 1, 1024] - - [804, 9816.92] + - [866, 9816.92] - - [1024, 3103, 1, 4096] - - [801, 8491.05] + - [863, 8491.05] - - [1024, 3214, 1, 4096] - - [822, 8667.67] + - [884, 8667.67] - - [4096, 3449, 1, 1024] - - [814, 9795.71] + - [876, 9795.71] - - [1024, 3136, 1, 4096] - - [823, 8500.61] + - [885, 8500.61] - - [1024, 3955, 1, 33708] - - [803, 9634.94] + - [865, 9634.94] - - [1024, 3780, 1, 4096] - - [806, 9088.88] + - [868, 9088.88] - - [1024, 3906, 1, 33708] - - [804, 9515.46] + - [866, 9515.46] - - [1024, 3386, 1, 4096] - - [823, 9116.05] + - [885, 9116.05] - - [4096, 3396, 1, 1024] - - [814, 9665.6] + - [876, 9665.6] - - [1024, 3183, 1, 4096] - - [801, 8662.94] + - [863, 8662.94] - - [1024, 3098, 1, 4096] - - [817, 8490.22] + - [879, 8490.22] - - [1024, 3548, 1, 4096] - - [823, 9555.63] + - [885, 9555.63] - - [1024, 3224, 1, 4096] - - [816, 8760.88] + - [878, 8760.88] - - [4096, 3469, 1, 1024] - - [803, 9687.21] + - [865, 9687.21] - - [1024, 3582, 1, 4096] - - [820, 9691.0] + - [882, 9691.0] - - [1024, 2977, 1, 4096] - - [805, 9379.38] + - [867, 9379.38] - - [1024, 3939, 1, 1024] - - [802, 9172.11] + - [864, 9172.11] - - [64, 123, 528, 123] - - [850, 6346.17] + - [912, 6346.17] - - [64, 12, 5040, 12] - - [827, 1536.1] + - [889, 1536.1] - - [4096, 3176, 1, 1024] - - [815, 9712.2] + - [877, 9712.2] - - [1024, 3559, 1, 4096] - - [819, 9579.84] + - [881, 9579.84] - - [1024, 3478, 1, 4096] - - [823, 9373.85] + - [885, 9373.85] - - [4096, 3343, 1, 1024] - - [799, 9638.77] + - [861, 9638.77] - - [4096, 3440, 1, 1024] - - [799, 9853.96] + - [861, 9853.96] - - [1024, 3996, 1, 33708] - - [803, 9733.55] + - [865, 9733.55] - - [1024, 4012, 1, 4096] - - [804, 9636.99] + - [866, 9636.99] - - [1024, 3322, 1, 4096] - - [823, 8945.12] + - [885, 8945.12] - - [1024, 3990, 1, 33708] - - [803, 9720.31] + - [865, 9720.31] - - [1024, 3314, 1, 4096] - - [823, 8944.72] + - [885, 8944.72] - - [4096, 3513, 1, 1024] - - [803, 9794.95] + - [865, 9794.95] - - [1024, 3562, 1, 4096] - - [823, 9597.28] + - [885, 9597.28] - - [1024, 3443, 1, 4096] - - [823, 9279.52] + - [885, 9279.52] - - [1024, 3554, 1, 4096] - - [820, 9552.16] + - [882, 9552.16] - - [1024, 3063, 1, 4096] - - [805, 9622.58] + - [867, 9622.58] - - [64, 111, 576, 112] - - [850, 6274.65] + - [912, 6274.65] - - [4096, 3460, 1, 1024] - - [803, 9665.69] + - [865, 9665.69] - - [1024, 3209, 1, 4096] - - [802, 8708.39] + - [864, 8708.39] - - [1024, 3147, 1, 4096] - - [823, 8492.23] + - [885, 8492.23] - - [4096, 3387, 1, 1024] - - [800, 9761.34] + - [862, 9761.34] - - [4096, 3436, 1, 1024] - - [799, 9815.15] + - [861, 9815.15] - - [1024, 3341, 1, 4096] - - [822, 9005.07] + - [884, 9005.07] - - [1024, 3516, 1, 4096] - - [822, 9471.39] + - [884, 9471.39] - - [4096, 3277, 1, 1024] - - [803, 9807.12] + - [865, 9807.12] - - [1024, 3454, 1, 4096] - - [823, 9301.03] + - [885, 9301.03] - - [1024, 3969, 1, 4096] - - [803, 9539.82] + - [865, 9539.82] - - [1024, 3999, 1, 4096] - - [804, 9607.52] + - [866, 9607.52] - - [1024, 4032, 1, 4096] - - [805, 9693.47] + - [867, 9693.47] - - [4096, 3541, 1, 1024] - - [804, 9866.73] + - [866, 9866.73] - - [4096, 3334, 1, 1024] - - [815, 9614.41] + - [877, 9614.41] - - [1024, 3365, 1, 4096] - - [823, 9058.58] + - [885, 9058.58] - - [1024, 3527, 1, 4096] - - [823, 9510.31] + - [885, 9510.31] - - [1024, 3190, 1, 4096] - - [822, 8627.8] + - [884, 8627.8] - - [4096, 3906, 1, 1024] - - [800, 9817.78] + - [862, 9817.78] - - [1024, 3593, 1, 4096] - - [805, 8663.09] + - [867, 8663.09] - - [1024, 3336, 1, 4096] - - [823, 8991.13] + - [885, 8991.13] - - [4096, 3504, 1, 1024] - - [803, 9769.86] + - [865, 9769.86] - - [4096, 3977, 1, 1024] - - [804, 9742.62] + - [866, 9742.62] - - [1024, 3906, 1, 4096] - - [804, 9386.25] + - [866, 9386.25] - - [4096, 3415, 1, 1024] - - [814, 9802.7] + - [876, 9802.7] - - [1024, 3295, 1, 4096] - - [822, 8879.26] + - [884, 8879.26] - - [4096, 3321, 1, 1024] - - [804, 9931.43] + - [866, 9931.43] - - [1024, 3072, 1, 4096] - - [805, 9671.71] + - [867, 9671.71] - - [1024, 3408, 1, 4096] - - [822, 9182.83] + - [884, 9182.83] - - [1024, 3522, 1, 4096] - - [823, 9484.63] + - [885, 9484.63] - - [4096, 3751, 1, 1024] - - [804, 9778.86] + - [866, 9778.86] - - [4096, 3378, 1, 1024] - - [814, 9692.77] + - [876, 9692.77] - - [64, 77, 816, 77] - - [856, 4850.29] + - [918, 4850.29] - - [1024, 3925, 1, 33708] - - [803, 9560.88] + - [865, 9560.88] - - [1024, 3990, 1, 1024] - - [805, 9272.75] + - [867, 9272.75] - - [1024, 3290, 1, 4096] - - [816, 8905.61] + - [878, 8905.61] - - [4096, 3500, 1, 1024] - - [804, 9761.82] + - [866, 9761.82] - - [4096, 3565, 1, 1024] - - [803, 9919.37] + - [865, 9919.37] - - [1024, 3484, 1, 4096] - - [822, 9376.52] + - [884, 9376.52] - - [4096, 3395, 1, 1024] - - [815, 9788.16] + - [877, 9788.16] - - [64, 92, 688, 92] - - [842, 5606.1] + - [904, 5606.1] - - [1024, 3681, 1, 1024] - - [807, 8690.23] + - [869, 8690.23] - - [64, 159, 400, 159] - - [834, 6518.97] + - [896, 6518.97] - - [1024, 3584, 1, 1024] - - [822, 9365.37] + - [884, 9365.37] - - [4096, 3093, 1, 1024] - - [814, 9623.41] + - [876, 9623.41] - - [1024, 4050, 1, 1024] - - [806, 9354.14] + - [868, 9354.14] - - [1024, 3301, 1, 4096] - - [823, 8889.04] + - [885, 8889.04] - - [1024, 3581, 1, 4096] - - [822, 9673.82] + - [884, 9673.82] - - [4096, 3374, 1, 1024] - - [815, 9707.33] + - [877, 9707.33] - - [1024, 3449, 1, 4096] - - [823, 9270.9] + - [885, 9270.9] - - [4096, 3215, 1, 1024] - - [804, 9645.25] + - [866, 9645.25] - - [4096, 3312, 1, 1024] - - [804, 9888.72] + - [866, 9888.72] - - [4096, 3479, 1, 1024] - - [804, 9698.61] + - [866, 9698.61] - - [4096, 3544, 1, 1024] - - [804, 9875.09] + - [866, 9875.09] - - [1024, 3263, 1, 4096] - - [823, 8787.61] + - [885, 8787.61] - - [4096, 3455, 1, 1024] - - [814, 9845.29] + - [876, 9845.29] - - [1024, 3379, 1, 4096] - - [820, 9100.01] + - [882, 9100.01] - - [1024, 3490, 1, 4096] - - [823, 9397.49] + - [885, 9397.49] - - [1024, 3368, 1, 4096] - - [823, 9079.25] + - [885, 9079.25] - - [4096, 3186, 1, 1024] - - [799, 9750.17] + - [861, 9750.17] - - [1024, 3428, 1, 4096] - - [823, 9232.92] + - [885, 9232.92] - - [64, 85, 752, 84] - - [838, 5342.67] + - [900, 5342.67] - - [4096, 3561, 1, 1024] - - [804, 9914.02] + - [866, 9914.02] - - [4096, 3418, 1, 1024] - - [814, 9765.86] + - [876, 9765.86] - - [1024, 3064, 1, 4096] - - [805, 9621.68] + - [867, 9621.68] - - [4096, 3259, 1, 1024] - - [804, 9765.52] + - [866, 9765.52] - - [4096, 3308, 1, 1024] - - [803, 9900.46] + - [865, 9900.46] - - [1024, 3533, 1, 4096] - - [823, 9520.12] + - [885, 9520.12] - - [1024, 3344, 1, 4096] - - [823, 9014.55] + - [885, 9014.55] - - [1024, 4030, 1, 1024] - - [805, 9354.1] + - [867, 9354.1] - - [4096, 3459, 1, 1024] - - [804, 9656.2] + - [866, 9656.2] - - [1024, 3572, 1, 4096] - - [820, 9640.07] + - [882, 9640.07] - - [1024, 3925, 1, 1024] - - [816, 9173.74] + - [878, 9173.74] - - [4096, 3435, 1, 1024] - - [799, 9778.2] + - [861, 9778.2] - - [1024, 3956, 1, 4096] - - [806, 9498.56] + - [868, 9498.56] - - [1024, 3463, 1, 4096] - - [823, 9332.46] + - [885, 9332.46] - - [4096, 3182, 1, 1024] - - [814, 9826.84] + - [876, 9826.84] - - [4096, 3976, 1, 1024] - - [814, 9741.99] + - [876, 9741.99] - - [1024, 3417, 1, 4096] - - [823, 9208.97] + - [885, 9208.97] - - [1024, 3528, 1, 4096] - - [823, 9509.09] + - [885, 9509.09] - - [4096, 3446, 1, 1024] - - [814, 9816.97] + - [876, 9816.97] - - [64, 122, 528, 123] - - [850, 6325.98] + - [912, 6325.98] - - [1024, 3543, 1, 4096] - - [823, 9538.73] + - [885, 9538.73] - - [4096, 3287, 1, 1024] - - [803, 9846.04] + - [865, 9846.04] - - [1024, 3499, 1, 4096] - - [823, 9428.51] + - [885, 9428.51] - - [1024, 3231, 1, 4096] - - [816, 8769.91] + - [878, 8769.91] - - [64, 17, 3632, 17] - - [838, 1934.94] + - [900, 1934.94] - - [4096, 3519, 1, 1024] - - [803, 9804.38] + - [865, 9804.38] - - [4096, 3552, 1, 1024] - - [803, 9892.65] + - [865, 9892.65] - - [1024, 3458, 1, 4096] - - [823, 9312.28] + - [885, 9312.28] - - [64, 93, 688, 92] - - [842, 5660.22] + - [904, 5660.22] - - [1024, 3374, 1, 4096] - - [817, 9110.41] + - [879, 9110.41] - - [1024, 3396, 1, 4096] - - [823, 9145.79] + - [885, 9145.79] - - [1024, 2967, 1, 4096] - - [805, 9364.76] + - [867, 9364.76] - - [64, 19, 3264, 19] - - [842, 2142.47] + - [904, 2142.47] - - [4096, 3482, 1, 1024] - - [803, 9714.2] + - [865, 9714.2] - - [64, 32, 1984, 32] - - [853, 3619.91] + - [915, 3619.91] - - [64, 102, 624, 99] - - [844, 5515.33] + - [906, 5515.33] - - [1024, 3226, 1, 4096] - - [802, 8790.47] + - [864, 8790.47] - - [4096, 3377, 1, 1024] - - [800, 9684.08] + - [862, 9684.08] - - [4096, 3426, 1, 1024] - - [815, 9869.94] + - [877, 9869.94] - - [4096, 2935, 1, 1024] - - [815, 9762.11] + - [877, 9762.11] - - [64, 133, 480, 133] - - [854, 5891.32] + - [916, 5891.32] - - [1024, 3439, 1, 4096] - - [823, 9253.99] + - [885, 9253.99] - - [4096, 3267, 1, 1024] - - [803, 9783.9] + - [865, 9783.9] - - [4096, 3499, 1, 1024] - - [804, 9761.11] + - [866, 9761.11] - - [4096, 3356, 1, 1024] - - [815, 9679.44] + - [877, 9679.44] - - [64, 232, 272, 232] - - [858, 7181.03] + - [920, 7181.03] - - [64, 162, 400, 159] - - [818, 6444.63] + - [880, 6444.63] - - [4096, 3939, 1, 1024] - - [814, 9878.0] + - [876, 9878.0] - - [1024, 3526, 1, 4096] - - [823, 9508.1] + - [885, 9508.1] - - [1024, 3859, 1, 33708] - - [804, 9402.13] + - [866, 9402.13] - - [1024, 3385, 1, 4096] - - [822, 9107.28] + - [884, 9107.28] - - [1024, 3496, 1, 4096] - - [823, 9418.0] + - [885, 9418.0] - - [4096, 3141, 1, 1024] - - [815, 9682.54] + - [877, 9682.54] - - [4096, 3510, 1, 1024] - - [803, 9786.59] + - [865, 9786.59] - - [1024, 3434, 1, 4096] - - [823, 9246.7] + - [885, 9246.7] - - [4096, 3969, 1, 1024] - - [803, 9714.85] + - [865, 9714.85] - - [1024, 3121, 1, 4096] - - [801, 8464.32] + - [863, 8464.32] - - [1024, 3232, 1, 4096] - - [823, 8711.73] + - [885, 8711.73] - - [1024, 4030, 1, 33708] - - [804, 9816.31] + - [866, 9816.31] - - [1024, 3780, 1, 33708] - - [812, 9315.54] + - [874, 9315.54] - - [1024, 3969, 1, 1024] - - [801, 9248.54] + - [863, 9248.54] - - [4096, 3527, 1, 1024] - - [803, 9832.94] + - [865, 9832.94] - - [4096, 3336, 1, 1024] - - [800, 9623.35] + - [862, 9623.35] - - [4096, 3290, 1, 1024] - - [803, 9852.21] + - [865, 9852.21] - - [64, 9, 6544, 9] - - [843, 1068.24] + - [905, 1068.24] - - [1024, 3469, 1, 4096] - - [823, 9350.55] + - [885, 9350.55] - - [4096, 3490, 1, 1024] - - [803, 9737.56] + - [865, 9737.56] - - [4096, 3064, 1, 1024] - - [803, 9890.02] + - [865, 9890.02] - - [4096, 3582, 1, 1024] - - [804, 9961.38] + - [866, 9961.38] - - [1024, 3956, 1, 1024] - - [801, 9294.25] + - [863, 9294.25] - - [4096, 3417, 1, 1024] - - [799, 9811.66] + - [861, 9811.66] - - [1024, 2736, 1, 4096] - - [805, 8636.7] + - [867, 8636.7] - - [64, 78, 816, 78] - - [842, 4946.1] + - [904, 4946.1] - - [1024, 3205, 1, 4096] - - [817, 8657.21] + - [879, 8657.21] - - [1024, 3143, 1, 4096] - - [817, 8567.87] + - [879, 8567.87] - - [1024, 4020, 1, 4096] - - [805, 9664.62] + - [867, 9664.62] - - [1024, 3318, 1, 4096] - - [802, 8967.05] + - [864, 8967.05] - - [4096, 3364, 1, 1024] - - [815, 9697.18] + - [877, 9697.18] - - [1024, 3353, 1, 4096] - - [823, 9034.17] + - [885, 9034.17] - - [1024, 3464, 1, 4096] - - [823, 9326.05] + - [885, 9326.05] - - [4096, 3205, 1, 1024] - - [803, 9619.1] + - [865, 9619.1] - - [4096, 3318, 1, 1024] - - [804, 9932.66] + - [866, 9932.66] - - [1024, 3402, 1, 4096] - - [822, 9153.49] + - [884, 9153.49] - - [4096, 3181, 1, 1024] - - [814, 9789.15] + - [876, 9789.15] - - [4096, 3550, 1, 1024] - - [804, 9888.13] + - [866, 9888.13] - - [4096, 3445, 1, 1024] - - [814, 9752.65] + - [876, 9752.65] - - [1024, 3138, 1, 4096] - - [800, 8484.1] + - [862, 8484.1] - - [64, 99, 624, 99] - - [850, 5323.99] + - [912, 5323.99] - - [4096, 3079, 1, 1024] - - [800, 9562.26] + - [862, 9562.26] - - [4096, 3144, 1, 1024] - - [814, 9686.66] + - [876, 9686.66] - - [4096, 3860, 1, 1024] - - [815, 9733.42] + - [877, 9733.42] - - [1024, 3515, 1, 4096] - - [823, 9478.44] + - [885, 9478.44] - - [4096, 3408, 1, 1024] - - [800, 9764.96] + - [862, 9764.96] - - [64, 101, 624, 102] - - [850, 5482.79] + - [912, 5482.79] - - [1024, 3181, 1, 4096] - - [802, 8593.26] + - [864, 8593.26] - - [4096, 3298, 1, 1024] - - [804, 9867.72] + - [866, 9867.72] - - [4096, 3585, 1, 1024] - - [814, 9633.01] + - [876, 9633.01] - - [1024, 3550, 1, 4096] - - [823, 9564.46] + - [885, 9564.46] - - [1024, 4020, 1, 1024] - - [806, 9339.15] + - [868, 9339.15] - - [4096, 3481, 1, 1024] - - [804, 9714.0] + - [866, 9714.0] - - [4096, 3530, 1, 1024] - - [804, 9833.99] + - [866, 9833.99] - - [4096, 3425, 1, 1024] - - [800, 9675.66] + - [862, 9675.66] - - [4096, 4026, 1, 1024] - - [804, 9849.77] + - [866, 9849.77] - - [1024, 3860, 1, 1024] - - [817, 9073.59] + - [879, 9073.59] - - [4096, 3975, 1, 1024] - - [804, 9737.72] + - [866, 9737.72] - - [1024, 3286, 1, 4096] - - [801, 8884.24] + - [863, 8884.24] - - [1024, 3176, 1, 4096] - - [801, 8597.48] + - [863, 8597.48] - - [1024, 3894, 1, 4096] - - [805, 9359.13] + - [867, 9359.13] - - [4096, 3355, 1, 1024] - - [814, 9693.09] + - [876, 9693.09] - - [4096, 3404, 1, 1024] - - [814, 9786.12] + - [876, 9786.12] - - [1024, 3501, 1, 4096] - - [822, 9426.14] + - [884, 9426.14] - - [4096, 3245, 1, 1024] - - [804, 9723.57] + - [866, 9723.57] - - [1024, 3431, 1, 4096] - - [820, 9244.32] + - [882, 9244.32] - - [1024, 4000, 1, 1024] - - [816, 9344.03] + - [878, 9344.03] - - [4096, 3509, 1, 1024] - - [803, 9781.72] + - [865, 9781.72] - - [4096, 3558, 1, 1024] - - [804, 9905.15] + - [866, 9905.15] - - [1024, 3535, 1, 4096] - - [822, 9519.15] + - [884, 9519.15] - - [1024, 3414, 1, 4096] - - [820, 9198.05] + - [882, 9198.05] - - [1024, 3445, 1, 4096] - - [823, 9279.66] + - [885, 9279.66] - - [1024, 3436, 1, 4096] - - [823, 9259.7] + - [885, 9259.7] - - [4096, 3472, 1, 1024] - - [804, 9685.27] + - [866, 9685.27] - - [1024, 3211, 1, 4096] - - [802, 8708.41] + - [864, 8708.41] - - [64, 7, 8192, 7] - - [839, 802.916] + - [901, 802.916] - - [4096, 3383, 1, 1024] - - [814, 9734.82] + - [876, 9734.82] - - [4096, 3448, 1, 1024] - - [815, 9828.54] + - [877, 9828.54] - - [1024, 3343, 1, 4096] - - [816, 9010.46] + - [878, 9010.46] - - [1024, 3518, 1, 4096] - - [823, 9468.02] + - [885, 9468.02] - - [4096, 3289, 1, 1024] - - [804, 9844.16] + - [866, 9844.16] - - [1024, 3440, 1, 4096] - - [819, 9269.52] + - [881, 9269.52] - - [1024, 4032, 1, 33708] - - [803, 9822.41] + - [865, 9822.41] - - [4096, 3489, 1, 1024] - - [803, 9742.03] + - [865, 9742.03] - - [4096, 3346, 1, 1024] - - [800, 9616.74] + - [862, 9616.74] - - [1024, 3534, 1, 4096] - - [822, 9524.29] + - [884, 9524.29] - - [1024, 3079, 1, 4096] - - [817, 8397.77] + - [879, 8397.77] - - [1024, 3955, 1, 4096] - - [804, 9492.25] + - [866, 9492.25] - - [4096, 3236, 1, 1024] - - [804, 9706.03] + - [866, 9706.03] - - [1024, 3545, 1, 4096] - - [822, 9551.97] + - [884, 9551.97] - - [1024, 3144, 1, 4096] - - [816, 8556.8] + - [878, 8556.8] - - [4096, 3780, 1, 1024] - - [803, 9847.6] + - [865, 9847.6] - - [4096, 3163, 1, 1024] - - [814, 9717.79] + - [876, 9717.79] - - [4096, 3468, 1, 1024] - - [804, 9686.49] + - [866, 9686.49] - - [1024, 3539, 1, 4096] - - [823, 9526.99] + - [885, 9526.99] - - [1024, 3541, 1, 4096] - - [823, 9532.86] + - [885, 9532.86] - - [4096, 3363, 1, 1024] - - [799, 9699.1] + - [861, 9699.1] - - [1024, 3475, 1, 4096] - - [823, 9357.1] + - [885, 9357.1] - - [4096, 3110, 1, 1024] - - [815, 9659.68] + - [877, 9659.68] - - [1024, 3509, 1, 4096] - - [822, 9450.59] + - [884, 9450.59] - - [1024, 3413, 1, 4096] - - [823, 9185.91] + - [885, 9185.91] - - [1024, 3975, 1, 1024] - - [801, 9315.52] + - [863, 9315.52] - - [4096, 3549, 1, 1024] - - [804, 9884.82] + - [866, 9884.82] - - [4096, 3342, 1, 1024] - - [814, 9644.37] + - [876, 9644.37] - - [1024, 2985, 1, 4096] - - [804, 9392.17] + - [866, 9392.17] - - [1024, 3876, 1, 33708] - - [803, 9442.32] + - [865, 9442.32] - - [4096, 3280, 1, 1024] - - [803, 9820.02] + - [865, 9820.02] - - [4096, 3191, 1, 1024] - - [815, 9862.18] + - [877, 9862.18] - - [4096, 3512, 1, 1024] - - [804, 9793.21] + - [866, 9793.21] - - [1024, 3560, 1, 4096] - - [820, 9555.55] + - [882, 9555.55] - - [4096, 2499, 1, 1024] - - [804, 9669.45] + - [866, 9669.45] - - [1024, 3248, 1, 4096] - - [801, 8811.94] + - [863, 8811.94] - - [4096, 3423, 1, 1024] - - [815, 9729.77] + - [877, 9729.77] - - [64, 111, 576, 111] - - [850, 5982.73] + - [912, 5982.73] - - [4096, 3297, 1, 1024] - - [803, 9865.29] + - [865, 9865.29] - - [4096, 3154, 1, 1024] - - [815, 9613.52] + - [877, 9613.52] - - [1024, 3303, 1, 4096] - - [802, 8951.89] + - [864, 8951.89] - - [1024, 3222, 1, 4096] - - [822, 8682.99] + - [884, 8682.99] - - [1024, 3978, 1, 1024] - - [806, 9235.03] + - [868, 9235.03] - - [4096, 3529, 1, 1024] - - [804, 9831.72] + - [866, 9831.72] - - [4096, 3386, 1, 1024] - - [814, 9755.77] + - [876, 9755.77] - - [64, 134, 480, 134] - - [829, 5990.63] + - [891, 5990.63] - - [1024, 3451, 1, 4096] - - [820, 9277.71] + - [882, 9277.71] - - [4096, 3562, 1, 1024] - - [804, 9908.92] + - [866, 9908.92] - - [4096, 3276, 1, 1024] - - [803, 9818.14] + - [865, 9818.14] - - [64, 135, 480, 132] - - [858, 6071.87] + - [920, 6071.87] - - [1024, 3894, 1, 33708] - - [803, 9487.89] + - [865, 9487.89] - - [64, 134, 480, 132] - - [857, 6091.75] + - [919, 6091.75] - - [4096, 3540, 1, 1024] - - [804, 9862.89] + - [866, 9862.89] - - [1024, 3416, 1, 4096] - - [822, 9206.27] + - [884, 9206.27] - - [1024, 4005, 1, 33708] - - [803, 9757.29] + - [865, 9757.29] - - [1024, 3942, 1, 4096] - - [806, 9455.85] + - [868, 9455.85] - - [4096, 3403, 1, 1024] - - [814, 9739.46] + - [876, 9739.46] - - [4096, 3381, 1, 1024] - - [815, 9760.14] + - [877, 9760.14] - - [1024, 3492, 1, 4096] - - [819, 9391.79] + - [881, 9391.79] - - [4096, 3101, 1, 1024] - - [815, 9626.02] + - [877, 9626.02] - - [1024, 3430, 1, 4096] - - [823, 9232.14] + - [885, 9232.14] - - [1024, 3977, 1, 4096] - - [806, 9563.0] + - [868, 9563.0] - - [1024, 3640, 1, 4096] - - [805, 8761.5] + - [867, 8761.5] - - [4096, 3557, 1, 1024] - - [804, 9905.52] + - [866, 9905.52] - - [4096, 3414, 1, 1024] - - [800, 9755.49] + - [862, 9755.49] - - [1024, 3391, 1, 4096] - - [823, 9142.66] + - [885, 9142.66] - - [64, 134, 480, 135] - - [832, 5922.15] + - [894, 5922.15] - - [64, 16, 3840, 16] - - [848, 2080.61] + - [910, 2080.61] - - [1024, 3356, 1, 4096] - - [823, 9051.09] + - [885, 9051.09] - - [4096, 3320, 1, 1024] - - [804, 9929.57] + - [866, 9929.57] - - [4096, 2765, 1, 1024] - - [804, 9750.28] + - [866, 9750.28] - - [64, 162, 400, 162] - - [821, 6515.29] + - [883, 6515.29] - - [1024, 3411, 1, 4096] - - [823, 9185.72] + - [885, 9185.72] - - [1024, 3978, 1, 4096] - - [803, 9562.77] + - [865, 9562.77] - - [4096, 3487, 1, 1024] - - [804, 9733.85] + - [866, 9733.85] - - [4096, 3520, 1, 1024] - - [803, 9813.95] + - [865, 9813.95] - - [4096, 3942, 1, 1024] - - [814, 9804.39] + - [876, 9804.39] - - [4096, 3431, 1, 1024] - - [799, 9819.06] + - [861, 9819.06] - - [1024, 3271, 1, 4096] - - [816, 8913.08] + - [878, 8913.08] - - [4096, 4020, 1, 1024] - - [803, 9831.42] + - [865, 9831.42] - - [1024, 3481, 1, 4096] - - [819, 9376.15] + - [881, 9376.15] - - [1024, 3419, 1, 4096] - - [822, 9208.68] + - [884, 9208.68] - - [1024, 4059, 1, 4096] - - [806, 9733.83] + - [868, 9733.83] - - [4096, 3345, 1, 1024] - - [815, 9651.43] + - [877, 9651.43] - - [4096, 3394, 1, 1024] - - [815, 9780.43] + - [877, 9780.43] - - [1024, 3298, 1, 4096] - - [822, 8889.63] + - [884, 8889.63] - - [4096, 3235, 1, 1024] - - [804, 9705.81] + - [866, 9705.81] - - [1024, 3681, 1, 33708] - - [811, 9146.22] + - [873, 9146.22] - - [1024, 3840, 1, 4096] - - [804, 9253.95] + - [866, 9253.95] - - [1024, 3362, 1, 4096] - - [823, 9059.81] + - [885, 9059.81] - - [4096, 3467, 1, 1024] - - [803, 9677.51] + - [865, 9677.51] - - [1024, 3349, 1, 4096] - - [823, 9034.07] + - [885, 9034.07] - - [1024, 3460, 1, 4096] - - [823, 9322.94] + - [885, 9322.94] - - [4096, 3214, 1, 1024] - - [804, 9644.46] + - [866, 9644.46] - - [1024, 3398, 1, 4096] - - [823, 9157.29] + - [885, 9157.29] - - [4096, 3478, 1, 1024] - - [803, 9706.66] + - [865, 9706.66] - - [1024, 4050, 1, 33708] - - [803, 9865.14] + - [865, 9865.14] - - [1024, 3244, 1, 4096] - - [819, 8744.53] + - [881, 8744.53] - - [4096, 3341, 1, 1024] - - [815, 9646.79] + - [877, 9646.79] - - [4096, 3454, 1, 1024] - - [800, 9880.56] + - [862, 9880.56] - - [1024, 3166, 1, 4096] - - [817, 8618.46] + - [879, 8618.46] - - [1024, 3425, 1, 4096] - - [823, 9225.32] + - [885, 9225.32] - - [4096, 3295, 1, 1024] - - [804, 9863.81] + - [866, 9863.81] - - [4096, 3072, 1, 1024] - - [803, 9971.09] + - [865, 9971.09] - - [4096, 3822, 1, 1024] - - [804, 9952.07] + - [866, 9952.07] - - [1024, 3681, 1, 4096] - - [805, 8856.94] + - [867, 8856.94] - - [1024, 4050, 1, 4096] - - [805, 9717.58] + - [867, 9717.58] - - [4096, 3495, 1, 1024] - - [803, 9741.14] + - [865, 9741.14] - - [4096, 3560, 1, 1024] - - [804, 9909.14] + - [866, 9909.14] - - [1024, 3524, 1, 4096] - - [822, 9503.2] + - [884, 9503.2] - - [1024, 3942, 1, 33708] - - [803, 9602.67] + - [865, 9602.67] - - [1024, 3304, 1, 4096] - - [802, 8928.76] + - [864, 8928.76] - - [1024, 3387, 1, 4096] - - [823, 9127.65] + - [885, 9127.65] - - [1024, 3498, 1, 4096] - - [822, 9423.39] + - [884, 9423.39] - - [4096, 3458, 1, 1024] - - [803, 9642.63] + - [865, 9642.63] - - [4096, 2967, 1, 1024] - - [803, 9626.71] + - [865, 9626.71] - - [64, 8, 7280, 8] - - [825, 1032.61] + - [887, 1032.61] - - [4096, 3385, 1, 1024] - - [799, 9735.77] + - [861, 9735.77] - - [4096, 3434, 1, 1024] - - [814, 9808.9] + - [876, 9808.9] - - [1024, 3519, 1, 4096] - - [823, 9484.83] + - [885, 9484.83] - - [1024, 3511, 1, 4096] - - [823, 9456.47] + - [885, 9456.47] - - [1024, 3288, 1, 4096] - - [822, 8864.05] + - [884, 8864.05] - - [1024, 2918, 1, 4096] - - [805, 9170.35] + - [867, 9170.35] - - [4096, 3573, 1, 1024] - - [804, 9945.85] + - [866, 9945.85] - - [1024, 3822, 1, 33708] - - [813, 9331.0] + - [875, 9331.0] - - [64, 102, 624, 102] - - [850, 5531.17] + - [912, 5531.17] - - [4096, 3539, 1, 1024] - - [804, 9855.39] + - [866, 9855.39] - - [4096, 3332, 1, 1024] - - [815, 9648.97] + - [877, 9648.97] - - [4096, 3286, 1, 1024] - - [804, 9846.42] + - [866, 9846.42] - - [1024, 4026, 1, 4096] - - [805, 9675.94] + - [867, 9675.94] - - [1024, 3277, 1, 4096] - - [819, 8836.21] + - [881, 8836.21] - - [1024, 3471, 1, 4096] - - [823, 9346.33] + - [885, 9346.33] - - [4096, 3518, 1, 1024] - - [804, 9804.2] + - [866, 9804.2] - - [1024, 3393, 1, 4096] - - [823, 9148.99] + - [885, 9148.99] - - [4096, 3413, 1, 1024] - - [800, 9785.17] + - [862, 9785.17] - - [4096, 3303, 1, 1024] - - [804, 9884.37] + - [866, 9884.37] - - [1024, 3207, 1, 4096] - - [801, 8714.69] + - [863, 8714.69] - - [1024, 3894, 1, 1024] - - [817, 9181.51] + - [879, 9181.51] - - [1024, 3977, 1, 1024] - - [817, 9240.9] + - [879, 9240.9] - - [64, 135, 480, 133] - - [832, 5923.4] + - [894, 5923.4] - - [4096, 3535, 1, 1024] - - [804, 9839.55] + - [866, 9839.55] - - [4096, 3376, 1, 1024] - - [799, 9712.02] + - [861, 9712.02] - - [1024, 3355, 1, 4096] - - [823, 9043.27] + - [885, 9043.27] - - [64, 27, 2336, 27] - - [851, 2929.9] + - [913, 2929.9] - - [1024, 3466, 1, 4096] - - [823, 9339.1] + - [885, 9339.1] - - [4096, 3266, 1, 1024] - - [804, 9789.29] + - [866, 9789.29] - - [1024, 3404, 1, 4096] - - [823, 9176.76] + - [885, 9176.76] - - [1024, 3999, 1, 1024] - - [816, 9391.91] + - [878, 9391.91] - - [64, 148, 432, 143] - - [829, 6182.92] + - [891, 6182.92] - - [4096, 3498, 1, 1024] - - [803, 9764.56] + - [865, 9764.56] - - [1024, 4032, 1, 1024] - - [801, 9402.03] + - [863, 9402.03] - - [1024, 3410, 1, 4096] - - [822, 9183.5] + - [884, 9183.5] - - [4096, 3393, 1, 1024] - - [815, 9695.49] + - [877, 9695.49] - - [1024, 3140, 1, 4096] - - [816, 8504.86] + - [878, 8504.86] - - [1024, 3910, 1, 33708] - - [803, 9526.06] + - [865, 9526.06] - - [1024, 3334, 1, 4096] - - [822, 8987.59] + - [884, 8987.59] - - [4096, 3140, 1, 1024] - - [815, 9660.71] + - [877, 9660.71] - - [1024, 4005, 1, 4096] - - [806, 9629.88] + - [868, 9629.88] - - [1024, 3579, 1, 4096] - - [822, 9661.45] + - [884, 9661.45] - - [4096, 3372, 1, 1024] - - [815, 9697.32] + - [877, 9697.32] - - [1024, 3245, 1, 4096] - - [816, 8847.76] + - [878, 8847.76] - - [64, 38, 1680, 38] - - [826, 3340.44] + - [888, 3340.44] - - [4096, 3956, 1, 1024] - - [815, 9911.15] + - [877, 9911.15] - - [4096, 3213, 1, 1024] - - [803, 9643.11] + - [865, 9643.11] - - [1024, 3361, 1, 4096] - - [823, 9062.24] + - [885, 9062.24] - - [1024, 3536, 1, 4096] - - [822, 9530.65] + - [884, 9530.65] - - [1024, 3968, 1, 1024] - - [817, 9377.92] + - [879, 9377.92] - - [4096, 3477, 1, 1024] - - [804, 9700.77] + - [866, 9700.77] - - [4096, 3526, 1, 1024] - - [804, 9824.41] + - [866, 9824.41] - - [1024, 4005, 1, 1024] - - [801, 9362.39] + - [863, 9362.39] - - [1024, 3530, 1, 4096] - - [820, 9487.17] + - [882, 9487.17] - - [1024, 3944, 1, 4096] - - [805, 9464.55] + - [867, 9464.55] - - [4096, 3453, 1, 1024] - - [814, 9826.77] + - [876, 9826.77] - - [4096, 3184, 1, 1024] - - [815, 9833.59] + - [877, 9833.59] - - [4096, 3579, 1, 1024] - - [804, 9962.55] + - [866, 9962.55] - - [4096, 3351, 1, 1024] - - [815, 9653.34] + - [877, 9653.34] - - [4096, 3416, 1, 1024] - - [799, 9810.4] + - [861, 9810.4] - - [64, 100, 624, 100] - - [850, 5408.55] + - [912, 5408.55] - - [1024, 3822, 1, 4096] - - [805, 9196.2] + - [867, 9196.2] - - [1024, 3796, 1, 4096] - - [805, 9131.96] + - [867, 9131.96] - - [4096, 3257, 1, 1024] - - [803, 9767.34] + - [865, 9767.34] - - [4096, 3306, 1, 1024] - - [803, 9893.35] + - [865, 9893.35] - - [1024, 3505, 1, 4096] - - [823, 9450.02] + - [885, 9450.02] - - [1024, 3315, 1, 4096] - - [816, 8979.77] + - [878, 8979.77] - - [1024, 3486, 1, 4096] - - [822, 9393.48] + - [884, 9393.48] - - [4096, 3457, 1, 1024] - - [803, 9653.19] + - [865, 9653.19] - - [4096, 3870, 1, 1024] - - [800, 9717.51] + - [862, 9717.51] - - [1024, 3447, 1, 4096] - - [823, 9273.14] + - [885, 9273.14] - - [1024, 3558, 1, 4096] - - [820, 9567.33] + - [882, 9567.33] - - [4096, 3433, 1, 1024] - - [800, 9759.26] + - [862, 9759.26] - - [4096, 3180, 1, 1024] - - [815, 9738.63] + - [877, 9738.63] - - [1024, 3213, 1, 4096] - - [801, 8692.25] + - [863, 8692.25] - - [1024, 3900, 1, 4096] - - [805, 9388.61] + - [867, 9388.61] - - [4096, 3444, 1, 1024] - - [814, 9869.73] + - [876, 9869.73] - - [1024, 3504, 1, 4096] - - [823, 9429.38] + - [885, 9429.38] - - [4096, 4059, 1, 1024] - - [804, 9920.79] + - [866, 9920.79] - - [1024, 3442, 1, 4096] - - [823, 9273.01] + - [885, 9273.01] - - [4096, 3517, 1, 1024] - - [803, 9808.19] + - [865, 9808.19] - - [1024, 3566, 1, 4096] - - [822, 9622.89] + - [884, 9622.89] - - [4096, 3248, 1, 1024] - - [803, 9730.33] + - [865, 9730.33] - - [1024, 3547, 1, 4096] - - [822, 9564.73] + - [884, 9564.73] - - [64, 59, 1088, 59] - - [841, 4611.76] + - [903, 4611.76] - - [1024, 3340, 1, 4096] - - [822, 8992.21] + - [884, 8992.21] - - [4096, 3480, 1, 1024] - - [804, 9710.17] + - [866, 9710.17] - - [1024, 3968, 1, 4096] - - [804, 9543.11] + - [866, 9543.11] - - [4096, 3424, 1, 1024] - - [800, 9808.66] + - [862, 9808.66] - - [1024, 3906, 1, 1024] - - [802, 9150.54] + - [864, 9150.54] - - [4096, 3265, 1, 1024] - - [803, 9786.85] + - [865, 9786.85] - - [1024, 3384, 1, 4096] - - [823, 9119.56] + - [885, 9119.56] - - [1024, 3494, 1, 4096] - - [820, 9415.52] + - [882, 9415.52] - - [1024, 3236, 1, 4096] - - [817, 8767.14] + - [879, 8767.14] - - [4096, 3497, 1, 1024] - - [804, 9750.86] + - [866, 9750.86] - - [4096, 3354, 1, 1024] - - [815, 9665.17] + - [877, 9665.17] - - [4096, 3055, 1, 1024] - - [804, 9884.09] + - [866, 9884.09] - - [64, 11, 5456, 11] - - [827, 1368.34] + - [889, 1368.34] - - [4096, 3244, 1, 1024] - - [803, 9720.02] + - [865, 9720.02] - - [4096, 3139, 1, 1024] - - [814, 9737.06] + - [876, 9737.06] - - [4096, 3508, 1, 1024] - - [803, 9771.66] + - [865, 9771.66] - - [4096, 4050, 1, 1024] - - [803, 9898.79] + - [865, 9898.79] - - [1024, 3472, 1, 4096] - - [822, 9353.83] + - [884, 9353.83] - - [1024, 3861, 1, 1024] - - [801, 9061.32] + - [863, 9061.32] - - [1024, 3910, 1, 1024] - - [805, 9043.54] + - [867, 9043.54] - - [4096, 3371, 1, 1024] - - [815, 9738.24] + - [877, 9738.24] - - [64, 65, 992, 65] - - [854, 4354.59] + - [916, 4354.59] - - [1024, 3751, 1, 4096] - - [804, 9018.74] + - [866, 9018.74] - - [4096, 3325, 1, 1024] - - [803, 9958.73] + - [865, 9958.73] - - [1024, 3321, 1, 4096] - - [823, 8952.55] + - [885, 8952.55] - - [1024, 3944, 1, 1024] - - [802, 9117.35] + - [864, 9117.35] - - [4096, 3525, 1, 1024] - - [804, 9822.14] + - [866, 9822.14] - - [4096, 3382, 1, 1024] - - [815, 9720.21] + - [877, 9720.21] - - [64, 122, 528, 122] - - [850, 6389.33] + - [912, 6389.33] - - [1024, 3453, 1, 4096] - - [820, 9305.03] + - [882, 9305.03] - - [4096, 3564, 1, 1024] - - [803, 9911.32] + - [865, 9911.32] - - [4096, 3288, 1, 1024] - - [803, 9841.17] + - [865, 9841.17] - - [1024, 3925, 1, 4096] - - [804, 9418.95] + - [866, 9418.95] - - [1024, 3057, 1, 4096] - - [805, 9590.51] + - [867, 9590.51] - - [4096, 3488, 1, 1024] - - [804, 9732.5] + - [866, 9732.5] - - [4096, 3046, 1, 1024] - - [804, 9850.72] + - [866, 9850.72] - - [1024, 3189, 1, 4096] - - [816, 8677.02] + - [878, 8677.02] - - [4096, 3399, 1, 1024] - - [800, 9673.09] + - [862, 9673.09] - - [1024, 3383, 1, 4096] - - [823, 9102.37] + - [885, 9102.37] - - [1024, 3415, 1, 4096] - - [823, 9216.37] + - [885, 9216.37] - - [1024, 3388, 1, 4096] - - [823, 9127.53] + - [885, 9127.53] - - [1024, 3376, 1, 4096] - - [820, 9090.53] + - [882, 9090.53] - - [1024, 3473, 1, 4096] - - [823, 9354.12] + - [885, 9354.12] - - [4096, 3162, 1, 1024] - - [799, 9694.83] + - [861, 9694.83] - - [1024, 3448, 1, 4096] - - [823, 9283.45] + - [885, 9283.45] - - [4096, 3362, 1, 1024] - - [815, 9673.33] + - [877, 9673.33] - - [64, 228, 272, 228] - - [808, 7039.13] + - [870, 7039.13] - - [1024, 3262, 1, 4096] - - [817, 8850.84] + - [879, 8850.84] - - [1024, 3184, 1, 4096] - - [802, 8625.37] + - [864, 8625.37] - - [1024, 3378, 1, 4096] - - [822, 9105.27] + - [884, 9105.27] - - [4096, 3548, 1, 1024] - - [803, 9877.83] + - [865, 9877.83] - - [4096, 2977, 1, 1024] - - [803, 9647.81] + - [865, 9647.81] - - [64, 21, 2976, 21] - - [838, 2364.81] + - [900, 2364.81] - - [64, 112, 576, 111] - - [837, 5973.68] + - [899, 5973.68] - - [4096, 3443, 1, 1024] - - [799, 9784.5] + - [861, 9784.5] - - [1024, 3289, 1, 4096] - - [823, 8874.04] + - [885, 8874.04] - - [1024, 3483, 1, 4096] - - [819, 9380.57] + - [881, 9380.57] - - [4096, 3190, 1, 1024] - - [815, 9850.96] + - [877, 9850.96] - - [1024, 3421, 1, 4096] - - [823, 9214.06] + - [885, 9214.06] - - [1024, 3514, 1, 4096] - - [822, 9458.23] + - [884, 9458.23] - - [1024, 3532, 1, 4096] - - [823, 9513.03] + - [885, 9513.03] - - [1024, 3565, 1, 4096] - - [822, 9630.6] + - [884, 9630.6] - - [4096, 3422, 1, 1024] - - [800, 9733.79] + - [862, 9733.79] - - [4096, 3263, 1, 1024] - - [804, 9776.94] + - [866, 9776.94] - - [4096, 3296, 1, 1024] - - [804, 9860.61] + - [866, 9860.61] - - [4096, 3640, 1, 1024] - - [814, 9782.3] + - [876, 9782.3] - - [4096, 3463, 1, 1024] - - [803, 9672.0] + - [865, 9672.0] - - [4096, 3528, 1, 1024] - - [804, 9829.98] + - [866, 9829.98] - - [1024, 3351, 1, 4096] - - [817, 9054.37] + - [879, 9054.37] - - [1024, 3462, 1, 4096] - - [823, 9327.85] + - [885, 9327.85] - - [4096, 3226, 1, 1024] - - [804, 9674.93] + - [866, 9674.93] - - [4096, 3439, 1, 1024] - - [799, 9823.18] + - [861, 9823.18] - - [4096, 3121, 1, 1024] - - [799, 9672.64] + - [861, 9672.64] - - [1024, 4059, 1, 33708] - - [803, 9885.72] + - [865, 9885.72] - - [1024, 3311, 1, 4096] - - [823, 8910.01] + - [885, 8910.01] - - [1024, 3230, 1, 4096] - - [823, 8705.9] + - [885, 8705.9] - - [4096, 3353, 1, 1024] - - [815, 9671.86] + - [877, 9671.86] - - [4096, 3402, 1, 1024] - - [800, 9727.04] + - [862, 9727.04] - - [1024, 3427, 1, 4096] - - [823, 9233.55] + - [885, 9233.55] - - [1024, 3346, 1, 4096] - - [823, 9015.77] + - [885, 9015.77] - - [1024, 3126, 1, 4096] - - [817, 8519.31] + - [879, 8519.31] - - [1024, 3796, 1, 1024] - - [801, 8916.75] + - [863, 8916.75] - - [1024, 3990, 1, 4096] - - [805, 9600.86] + - [867, 9600.86] - - [1024, 3257, 1, 4096] - - [801, 8790.42] + - [863, 8790.42] - - [4096, 3996, 1, 1024] - - [804, 9788.25] + - [866, 9788.25] - - [64, 143, 432, 143] - - [832, 6087.24] + - [894, 6087.24] - - [1024, 3306, 1, 4096] - - [816, 9035.69] + - [878, 9035.69] - - [1024, 3389, 1, 4096] - - [823, 9134.92] + - [885, 9134.92] - - [1024, 3500, 1, 4096] - - [823, 9443.33] + - [885, 9443.33] - - [1024, 3999, 1, 33708] - - [804, 9741.24] + - [866, 9741.24] - - [4096, 3486, 1, 1024] - - [804, 9719.67] + - [866, 9719.67] - - [1024, 3438, 1, 4096] - - [823, 9259.38] + - [885, 9259.38] - - [4096, 3616, 1, 1024] - - [814, 9739.77] + - [876, 9739.77] - - [1024, 3955, 1, 1024] - - [816, 9260.37] + - [878, 9260.37] - - [4096, 3430, 1, 1024] - - [815, 9819.95] + - [877, 9819.95] - - [4096, 3271, 1, 1024] - - [804, 9802.04] + - [866, 9802.04] - - [1024, 3364, 1, 4096] - - [816, 9144.63] + - [878, 9144.63] - - [64, 54, 1184, 54] - - [836, 4315.78] + - [898, 4315.78] - - [1024, 3497, 1, 4096] - - [823, 9429.42] + - [885, 9429.42] - - [4096, 3503, 1, 1024] - - [803, 9764.48] + - [865, 9764.48] - - [4096, 3344, 1, 1024] - - [800, 9614.16] + - [862, 9614.16] - - [1024, 3457, 1, 4096] - - [823, 9320.6] + - [885, 9320.6] - - [4096, 3466, 1, 1024] - - [803, 9677.81] + - [865, 9677.81] - - [1024, 3976, 1, 33708] - - [804, 9685.38] + - [866, 9685.38] - - [1024, 3395, 1, 4096] - - [822, 9146.39] + - [884, 9146.39] - - [4096, 3361, 1, 1024] - - [814, 9677.89] + - [876, 9677.89] - - [1024, 3751, 1, 33708] - - [812, 9234.69] + - [874, 9234.69] - - [1024, 3822, 1, 1024] - - [801, 8977.83] + - [863, 8977.83] - - [4096, 3315, 1, 1024] - - [804, 9922.54] + - [866, 9922.54] - - [1024, 3163, 1, 4096] - - [816, 8577.79] + - [878, 8577.79] - - [4096, 3547, 1, 1024] - - [804, 9882.92] + - [866, 9882.92] - - [4096, 3340, 1, 1024] - - [814, 9635.42] + - [876, 9635.42] - - [1024, 3296, 1, 4096] - - [823, 8874.66] + - [885, 8874.66] - - [1024, 3468, 1, 4096] - - [823, 9350.26] + - [885, 9350.26] - - [4096, 3294, 1, 1024] - - [803, 9856.87] + - [865, 9856.87] - - [1024, 3406, 1, 4096] - - [819, 9162.84] + - [881, 9162.84] - - [1024, 3860, 1, 33708] - - [803, 9403.56] + - [865, 9403.56] - - [1024, 3584, 1, 4096] - - [820, 9677.44] + - [882, 9677.44] - - [4096, 3189, 1, 1024] - - [815, 9820.69] + - [877, 9820.69] - - [4096, 3494, 1, 1024] - - [803, 9747.68] + - [865, 9747.68] - - [64, 135, 480, 135] - - [829, 5966.34] + - [891, 5966.34] - - [1024, 3093, 1, 4096] - - [817, 8446.06] + - [879, 8446.06] - - [4096, 3421, 1, 1024] - - [800, 9776.03] + - [862, 9776.03] - - [1024, 3479, 1, 4096] - - [823, 9376.54] + - [885, 9376.54] - - [1024, 3433, 1, 4096] - - [823, 9251.14] + - [885, 9251.14] - - [4096, 3311, 1, 1024] - - [803, 9901.53] + - [865, 9901.53] - - [1024, 3381, 1, 4096] - - [823, 9103.99] + - [885, 9103.99] - - [1024, 3996, 1, 4096] - - [804, 9609.56] + - [866, 9609.56] - - [4096, 3384, 1, 1024] - - [814, 9750.01] + - [876, 9750.01] - - [1024, 3247, 1, 4096] - - [802, 8872.59] + - [864, 8872.59] - - [1024, 3169, 1, 4096] - - [801, 8597.61] + - [863, 8597.61] - - [1024, 3088, 1, 4096] - - [817, 8410.07] + - [879, 8410.07] - - [1024, 3363, 1, 4096] - - [823, 9069.5] + - [885, 9069.5] - - [1024, 3538, 1, 4096] - - [822, 9529.68] + - [884, 9529.68] - - [1024, 3996, 1, 1024] - - [806, 9323.06] + - [868, 9323.06] - - [4096, 3169, 1, 1024] - - [800, 9821.4] + - [862, 9821.4] - - [4096, 3538, 1, 1024] - - [803, 9859.42] + - [865, 9859.42] - - [4096, 3401, 1, 1024] - - [800, 9754.5] + - [862, 9754.5] - - [4096, 3581, 1, 1024] - - [803, 9960.71] + - [865, 9960.71] - - [1024, 3180, 1, 4096] - - [801, 8635.05] + - [863, 8635.05] - - [1024, 3870, 1, 1024] - - [802, 9085.69] + - [864, 9085.69] - - [4096, 3555, 1, 1024] - - [803, 9905.74] + - [865, 9905.74] - - [4096, 3412, 1, 1024] - - [815, 9778.56] + - [877, 9778.56] - - [4096, 3302, 1, 1024] - - [803, 9888.71] + - [865, 9888.71] - - [1024, 3561, 1, 4096] - - [819, 9597.05] + - [881, 9597.05] - - [1024, 3302, 1, 4096] - - [823, 8900.87] + - [885, 8900.87] - - [1024, 3976, 1, 4096] - - [805, 9563.22] + - [867, 9563.22] - - [4096, 3485, 1, 1024] - - [803, 9722.57] + - [865, 9722.57] - - [4096, 3534, 1, 1024] - - [803, 9847.22] + - [865, 9847.22] - - [1024, 3110, 1, 4096] - - [816, 8458.56] + - [878, 8458.56] - - [1024, 3401, 1, 4096] - - [823, 9174.81] + - [885, 9174.81] - - [4096, 3216, 1, 1024] - - [803, 9645.49] + - [865, 9645.49] - - [1024, 4020, 1, 33708] - - [803, 9793.61] + - [865, 9793.61] - - [1024, 3215, 1, 4096] - - [823, 8677.51] + - [885, 8677.51] - - [4096, 3566, 1, 1024] - - [803, 9924.78] + - [865, 9924.78] - - [1024, 3137, 1, 4096] - - [801, 8547.07] + - [863, 8547.07] - - [4096, 3359, 1, 1024] - - [800, 9673.73] + - [862, 9673.73] - - [4096, 3392, 1, 1024] - - [815, 9757.51] + - [877, 9757.51] - - [1024, 3506, 1, 4096] - - [823, 9443.0] + - [885, 9443.0] - - [4096, 3233, 1, 1024] - - [803, 9698.7] + - [865, 9698.7] - - [1024, 3444, 1, 4096] - - [823, 9275.54] + - [885, 9275.54] - - [1024, 3975, 1, 4096] - - [804, 9556.87] + - [866, 9556.87] - - [1024, 3870, 1, 33708] - - [803, 9427.44] + - [865, 9427.44] - - [4096, 3465, 1, 1024] - - [804, 9675.01] + - [866, 9675.01] - - [4096, 3968, 1, 1024] - - [800, 9927.93] + - [862, 9927.93] - - [1024, 3523, 1, 4096] - - [823, 9494.15] + - [885, 9494.15] - - [64, 10, 5952, 10] - - [827, 1224.16] + - [889, 1224.16] - - [4096, 3990, 1, 1024] - - [803, 9771.27] + - [865, 9771.27] - - [1024, 3549, 1, 4096] - - [822, 9553.42] + - [884, 9553.42] - - [1024, 3342, 1, 4096] - - [823, 9007.31] + - [885, 9007.31] - - [4096, 3476, 1, 1024] - - [803, 9703.66] + - [865, 9703.66] - - [64, 232, 272, 228] - - [809, 7078.93] + - [871, 7078.93] - - [1024, 3418, 1, 4096] - - [823, 9213.09] + - [885, 9213.09] - - [1024, 3859, 1, 1024] - - [802, 9087.54] + - [864, 9087.54] - - [4096, 3339, 1, 1024] - - [815, 9594.0] + - [877, 9594.0] - - [4096, 3452, 1, 1024] - - [800, 9872.69] + - [862, 9872.69] - - [4096, 3293, 1, 1024] - - [803, 9842.65] + - [865, 9842.65] - - [4096, 3840, 1, 1024] - - [804, 10030.8] + - [866, 10030.8] - - [1024, 3369, 1, 4096] - - [801, 9099.72] + - [863, 9099.72] - - [64, 193, 320, 193] - - [831, 6425.8] + - [893, 6425.8] - - [1024, 3544, 1, 4096] - - [820, 9556.64] + - [882, 9556.64] - - [4096, 3493, 1, 1024] - - [804, 9743.34] + - [866, 9743.34] - - [4096, 3350, 1, 1024] - - [815, 9653.11] + - [877, 9653.11] - - [64, 71, 896, 71] - - [855, 4686.73] + - [917, 4686.73] - - [4096, 3256, 1, 1024] - - [803, 9763.78] + - [865, 9763.78] - - [1024, 3870, 1, 4096] - - [805, 9305.28] + - [867, 9305.28] - - [4096, 4012, 1, 1024] - - [804, 9817.35] + - [866, 9817.35] - - [1024, 3280, 1, 4096] - - [823, 8842.02] + - [885, 8842.02] - - [4096, 3456, 1, 1024] - - [799, 9874.43] + - [861, 9874.43] - - [1024, 3555, 1, 4096] - - [822, 9599.63] + - [884, 9599.63] - - [4096, 3014, 1, 1024] - - [803, 9762.28] + - [865, 9762.28] - - [1024, 3474, 1, 4096] - - [823, 9373.67] + - [885, 9373.67] - - [4096, 3367, 1, 1024] - - [799, 9694.64] + - [861, 9694.64] - - [4096, 3432, 1, 1024] - - [815, 9855.27] + - [877, 9855.27] - - [64, 84, 752, 84] - - [842, 5247.18] + - [904, 5247.18] - - [4096, 3273, 1, 1024] - - [804, 9801.87] + - [866, 9801.87] - - [4096, 3130, 1, 1024] - - [800, 9672.52] + - [862, 9672.52] - - [1024, 2984, 1, 4096] - - [805, 9403.7] + - [867, 9403.7] - - [1024, 3995, 1, 1024] - - [817, 9392.61] + - [879, 9392.61] - - [1024, 3517, 1, 4096] - - [823, 9481.39] + - [885, 9481.39] - - [1024, 3455, 1, 4096] - - [823, 9302.29] + - [885, 9302.29] - - [1024, 3939, 1, 4096] - - [805, 9469.89] + - [867, 9469.89] - - [64, 49, 1296, 49] - - [835, 3938.96] + - [897, 3938.96] - - [64, 14, 4368, 14] - - [827, 1802.47] + - [889, 1802.47] - - [64, 25, 2512, 25] - - [846, 2760.54] + - [908, 2760.54] - - [4096, 3147, 1, 1024] - - [815, 9713.03] + - [877, 9713.03] - - [4096, 3516, 1, 1024] - - [803, 9805.93] + - [865, 9805.93] - - [1024, 3876, 1, 4096] - - [805, 9320.56] + - [867, 9320.56] - - [1024, 3191, 1, 4096] - - [802, 8640.76] + - [864, 8640.76] - - [4096, 3411, 1, 1024] - - [814, 9737.37] + - [876, 9737.37] - - [1024, 3337, 1, 4096] - - [823, 8990.13] + - [885, 8990.13] - - [1024, 3512, 1, 4096] - - [823, 9459.65] + - [885, 9459.65] - - [4096, 3301, 1, 1024] - - [803, 9877.26] + - [865, 9877.26] - - [1024, 3450, 1, 4096] - - [822, 9283.11] + - [884, 9283.11] - - [4096, 3533, 1, 1024] - - [803, 9848.62] + - [865, 9848.62] - - [4096, 3390, 1, 1024] - - [815, 9764.61] + - [877, 9764.61] - - [4096, 3231, 1, 1024] - - [803, 9693.81] + - [865, 9693.81] - - [1024, 2499, 1, 4096] - - [822, 9304.81] + - [884, 9304.81] - - [1024, 3186, 1, 4096] - - [802, 8649.55] + - [864, 8649.55] - - [1024, 3380, 1, 4096] - - [823, 9101.77] + - [885, 9101.77] - - [4096, 3496, 1, 1024] - - [804, 9754.3] + - [866, 9754.3] - - [1024, 3956, 1, 33708] - - [803, 9636.77] + - [865, 9636.77] - - [1024, 3976, 1, 1024] - - [805, 9248.41] + - [867, 9248.41] - - [4096, 2736, 1, 1024] - - [803, 9651.91] + - [865, 9651.91] - - [1024, 3291, 1, 4096] - - [823, 8868.94] + - [885, 8868.94] - - [1024, 3944, 1, 33708] - - [804, 9607.0] + - [866, 9607.0] - - [1024, 3485, 1, 4096] - - [822, 9385.96] + - [884, 9385.96] - - [4096, 3138, 1, 1024] - - [800, 9672.15] + - [862, 9672.15] - - [1024, 3423, 1, 4096] - - [823, 9222.77] + - [885, 9222.77] - - [1024, 3491, 1, 4096] - - [823, 9405.02] + - [885, 9405.02] - - [1024, 3860, 1, 4096] - - [806, 9282.94] + - [868, 9282.94] - - [4096, 3211, 1, 1024] - - [803, 9640.42] + - [865, 9640.42] - - [1024, 3221, 1, 4096] - - [817, 8709.4] + - [879, 8709.4] - - [1024, 2917, 1, 4096] - - [805, 9177.11] + - [867, 9177.11] - - [4096, 3475, 1, 1024] - - [803, 9703.45] + - [865, 9703.45] - - [4096, 3524, 1, 1024] - - [803, 9816.23] + - [865, 9816.23] - - [4096, 2985, 1, 1024] - - [804, 9686.91] + - [866, 9686.91] - - [1024, 3480, 1, 4096] - - [823, 9380.2] + - [885, 9380.2] - - [4096, 3222, 1, 1024] - - [803, 9666.8] + - [865, 9666.8] - - [4096, 3451, 1, 1024] - - [799, 9877.91] + - [861, 9877.91] - - [1024, 3969, 1, 33708] - - [803, 9669.64] + - [865, 9669.64] - - [1024, 3640, 1, 1024] - - [810, 8565.68] + - [872, 8565.68] - - [1024, 3297, 1, 4096] - - [819, 8889.22] + - [881, 8889.22] - - [4096, 3944, 1, 1024] - - [800, 9902.85] + - [862, 9902.85] - - [1024, 3216, 1, 4096] - - [802, 8695.88] + - [864, 8695.88] - - [1024, 3840, 1, 1024] - - [816, 9046.05] + - [878, 9046.05] - - [4096, 3349, 1, 1024] - - [814, 9676.82] + - [876, 9676.82] - - [4096, 3398, 1, 1024] - - [800, 9775.84] + - [862, 9775.84] - - [1024, 3154, 1, 4096] - - [817, 8662.26] + - [879, 8662.26] - - [1024, 3978, 1, 33708] - - [804, 9689.16] + - [866, 9689.16] - - [1024, 3348, 1, 4096] - - [823, 9014.67] + - [885, 9014.67] - - [4096, 3304, 1, 1024] - - [804, 9886.8] + - [866, 9886.8] - - [4096, 4030, 1, 1024] - - [804, 9859.1] + - [866, 9859.1] - - [1024, 4026, 1, 1024] - - [801, 9326.64] + - [863, 9326.64] - - [4096, 3471, 1, 1024] - - [803, 9683.0] + - [865, 9683.0] - - [1024, 3259, 1, 4096] - - [817, 8792.19] + - [879, 8792.19] - - [64, 132, 480, 132] - - [857, 6027.86] + - [919, 6027.86] - - [1024, 3308, 1, 4096] - - [822, 8905.14] + - [884, 8905.14] - - [4096, 3391, 1, 1024] - - [815, 9765.35] + - [877, 9765.35] - - [1024, 3312, 1, 4096] - - [823, 8917.74] + - [885, 8917.74] - - [1024, 3502, 1, 4096] - - [823, 9435.62] + - [885, 9435.62] - - [1024, 3968, 1, 33708] - - [803, 9668.24] + - [865, 9668.24] - - [1024, 3424, 1, 4096] - - [819, 9215.99] + - [881, 9215.99] - - [64, 13, 4672, 13] - - [828, 1662.35] + - [890, 1662.35] - - [4096, 4032, 1, 1024] - - [814, 9877.82] + - [876, 9877.82] - - [1024, 3900, 1, 1024] - - [817, 9116.93] + - [879, 9116.93] - - [4096, 3442, 1, 1024] - - [814, 9773.18] + - [876, 9773.18] - - [1024, 3366, 1, 4096] - - [823, 9079.46] + - [885, 9079.46] - - [4096, 3999, 1, 1024] - - [803, 9786.46] + - [865, 9786.46] - - [1024, 3477, 1, 4096] - - [823, 9364.89] + - [885, 9364.89] - - [1024, 2505, 1, 4096] - - [823, 9304.03] + - [885, 9304.03] - - [4096, 3515, 1, 1024] - - [803, 9797.93] + - [865, 9797.93] - - [1024, 3564, 1, 4096] - - [819, 9632.86] + - [881, 9632.86] - - [4096, 3057, 1, 1024] - - [804, 9880.19] + - [866, 9880.19] - - [1024, 3339, 1, 4096] - - [802, 9029.86] + - [864, 9029.86] - - [4096, 3262, 1, 1024] - - [803, 9780.1] + - [865, 9780.1] - - [1024, 4030, 1, 4096] - - [806, 9682.0] + - [868, 9682.0] - - [1024, 3265, 1, 4096] - - [823, 8797.52] + - [885, 8797.52] - - [1024, 3459, 1, 4096] - - [823, 9313.06] + - [885, 9313.06] - - [4096, 3462, 1, 1024] - - [804, 9669.73] + - [866, 9669.73] - - [64, 85, 752, 85] - - [842, 5186.93] + - [904, 5186.93] - - [1024, 3513, 1, 4096] - - [820, 9469.15] + - [882, 9469.15] - - [1024, 3397, 1, 4096] - - [823, 9151.77] + - [885, 9151.77] - - [4096, 3572, 1, 1024] - - [803, 9945.7] + - [865, 9945.7] - - [4096, 3389, 1, 1024] - - [815, 9740.86] + - [877, 9740.86] - - [4096, 3438, 1, 1024] - - [815, 9822.47] + - [877, 9822.47] - - [64, 102, 624, 100] - - [850, 5487.0] + - [912, 5487.0] - - [1024, 3640, 1, 33708] - - [811, 9083.53] + - [873, 9083.53] - - [1024, 3995, 1, 33708] - - [804, 9731.99] + - [866, 9731.99] - - [1024, 3165, 1, 4096] - - [816, 8601.9] + - [878, 8601.9] - - [4096, 3543, 1, 1024] - - [804, 9868.63] + - [866, 9868.63] - - [4096, 3352, 1, 1024] - - [799, 9668.44] + - [861, 9668.44] - - [1024, 3359, 1, 4096] - - [820, 9050.33] + - [882, 9050.33] - - [1024, 3470, 1, 4096] - - [823, 9355.17] + - [885, 9355.17] - - [64, 15, 4096, 15] - - [827, 1945.43] + - [889, 1945.43] - - [1024, 3392, 1, 4096] - - [822, 9139.71] + - [884, 9139.71] - - [64, 78, 816, 77] - - [834, 4870.56] + - [896, 4870.56] - - [4096, 3137, 1, 1024] - - [799, 9600.22] + - [861, 9600.22] - - [4096, 3506, 1, 1024] - - [804, 9779.08] + - [866, 9779.08] - - [1024, 3095, 1, 4096] - - [816, 8381.24] + - [878, 8381.24] - - [1024, 3859, 1, 4096] - - [803, 9288.63] + - [865, 9288.63] - - [4096, 3369, 1, 1024] - - [815, 9697.73] + - [877, 9697.73] - - [64, 45, 1424, 45] - - [852, 3883.74] + - [914, 3883.74] - - [1024, 3435, 1, 4096] - - [823, 9264.62] + - [885, 9264.62] - - [1024, 3354, 1, 4096] - - [823, 9035.47] + - [885, 9035.47] - - [1024, 3055, 1, 4096] - - [804, 9597.45] + - [866, 9597.45] - - [4096, 3523, 1, 1024] - - [803, 9821.79] + - [865, 9821.79] - - [4096, 3380, 1, 1024] - - [799, 9721.39] + - [861, 9721.39] - - [1024, 3233, 1, 4096] - - [816, 8724.75] + - [878, 8724.75] - - [4096, 3221, 1, 1024] - - [803, 9661.04] + - [865, 9661.04] - - [4096, 3270, 1, 1024] - - [803, 9797.92] + - [865, 9797.92] - - [4096, 3593, 1, 1024] - - [814, 9679.31] + - [876, 9679.31] - - [1024, 3358, 1, 4096] - - [823, 9051.82] + - [885, 9051.82] - - [1024, 3540, 1, 4096] - - [823, 9533.59] + - [885, 9533.59] - - [4096, 3502, 1, 1024] - - [804, 9760.65] + - [866, 9760.65] - - [4096, 2505, 1, 1024] - - [804, 9680.52] + - [866, 9680.52] - - [4096, 3397, 1, 1024] - - [814, 9785.85] + - [876, 9785.85] - - [1024, 3300, 1, 4096] - - [817, 8907.85] + - [879, 8907.85] - - [4096, 3095, 1, 1024] - - [800, 9618.78] + - [862, 9618.78] - - [1024, 3182, 1, 4096] - - [816, 8606.16] + - [878, 8606.16] - - [1024, 3299, 1, 4096] - - [822, 8885.48] + - [884, 8885.48] - - [1024, 3276, 1, 4096] - - [817, 8872.75] + - [879, 8872.75] - - [1024, 3360, 1, 4096] - - [820, 9044.2] + - [882, 9044.2] - - [4096, 3360, 1, 1024] - - [815, 9681.39] + - [877, 9681.39] - - [4096, 2918, 1, 1024] - - [799, 9732.74] + - [861, 9732.74] - - [1024, 3939, 1, 33708] - - [803, 9595.96] + - [865, 9595.96] - - [4096, 3314, 1, 1024] - - [804, 9915.02] + - [866, 9915.02] - - [1024, 3319, 1, 4096] - - [823, 8956.37] + - [885, 8956.37] - - [64, 35, 1808, 35] - - [840, 3060.27] + - [902, 3060.27] - - [1024, 3942, 1, 1024] - - [816, 9211.83] + - [878, 9211.83] - - [1024, 3465, 1, 4096] - - [823, 9340.73] + - [885, 9340.73] - - [4096, 3546, 1, 1024] - - [804, 9875.41] + - [866, 9875.41] - - [1024, 3403, 1, 4096] - - [816, 9224.34] + - [878, 9224.34] - - [1024, 3948, 1, 1024] - - [802, 9245.63] + - [864, 9245.63] - - [4096, 3441, 1, 1024] - - [815, 9758.72] + - [877, 9758.72] - - [1024, 3139, 1, 4096] - - [816, 8582.84] + - [878, 8582.84] - - [1024, 3563, 1, 4096] - - [823, 9620.74] + - [885, 9620.74] - - [1024, 3508, 1, 4096] - - [820, 9449.36] + - [882, 9449.36] - - [1024, 3975, 1, 33708] - - [803, 9683.55] + - [865, 9683.55] - - [1024, 3446, 1, 4096] - - [822, 9289.51] + - [884, 9289.51] - - [1024, 3529, 1, 4096] - - [819, 9491.29] + - [881, 9491.29] - - [64, 112, 576, 112] - - [844, 6387.14] + - [906, 6387.14] - - [4096, 3461, 1, 1024] - - [804, 9663.33] + - [866, 9663.33] - - [1024, 3574, 1, 4096] - - [822, 9662.88] + - [884, 9662.88] - - [1024, 3101, 1, 4096] - - [817, 8468.34] + - [879, 8468.34] - - [1024, 3927, 1, 1024] - - [802, 9207.97] + - [864, 9207.97] - - [4096, 3224, 1, 1024] - - [804, 9665.61] + - [866, 9665.61] - - [4096, 3437, 1, 1024] - - [800, 9857.21] + - [862, 9857.21] - - [4096, 3900, 1, 1024] - - [815, 9826.25] + - [877, 9826.25] - - [1024, 3495, 1, 4096] - - [823, 9412.41] + - [885, 9412.41] - - [1024, 3977, 1, 33708] - - [803, 9687.87] + - [865, 9687.87] - - [1024, 3328, 1, 4096] - - [823, 8975.57] + - [885, 8975.57] - - [4096, 3168, 1, 1024] - - [799, 9754.87] + - [861, 9754.87] - - [1024, 4026, 1, 33708] - - [803, 9807.24] + - [865, 9807.24] - - [1024, 3292, 1, 4096] - - [816, 8901.83] + - [878, 8901.83] - - [1024, 3294, 1, 4096] - - [823, 8877.03] + - [885, 8877.03] - - [4096, 3335, 1, 1024] - - [800, 9616.23] + - [862, 9616.23] - - [4096, 3400, 1, 1024] - - [814, 9710.73] + - [876, 9710.73] - - [1024, 3287, 1, 4096] - - [801, 8908.07] + - [863, 8908.07] - - [1024, 3910, 1, 4096] - - [805, 9401.03] + - [867, 9401.03] - - [1024, 3780, 1, 1024] - - [816, 8863.29] + - [878, 8863.29] - - [4096, 3098, 1, 1024] - - [800, 9606.47] + - [862, 9606.47] - - [1024, 3584, 1, 33708] - - [823, 9775.33] + - [885, 9775.33] - - [64, 29, 2176, 29] - - [845, 3135.03] + - [907, 3135.03] - - [1024, 3371, 1, 4096] - - [801, 9117.81] + - [863, 9117.81] - - [1024, 3546, 1, 4096] - - [823, 9547.3] + - [885, 9547.3] - - [1024, 4012, 1, 1024] - - [805, 9353.73] + - [867, 9353.73] - - [4096, 3505, 1, 1024] - - [803, 9773.17] + - [865, 9773.17] - - [4096, 3554, 1, 1024] - - [803, 9895.59] + - [865, 9895.59] - - [4096, 3063, 1, 1024] - - [803, 9898.98] + - [865, 9898.98] - - [1024, 3900, 1, 33708] - - [804, 9502.93] + - [866, 9502.93] - - [1024, 3345, 1, 4096] - - [823, 9015.85] + - [885, 9015.85] - - [1024, 3357, 1, 4096] - - [823, 9041.23] + - [885, 9041.23] - - [1024, 3282, 1, 4096] - - [816, 8860.17] + - [878, 8860.17] - - [4096, 3484, 1, 1024] - - [804, 9721.33] + - [866, 9721.33] - - [1024, 3557, 1, 4096] - - [820, 9573.48] + - [882, 9573.48] - - [1024, 3476, 1, 4096] - - [823, 9361.72] + - [885, 9361.72] - - [1024, 3751, 1, 1024] - - [817, 8849.11] + - [879, 8849.11] - - [4096, 3379, 1, 1024] - - [800, 9741.49] + - [862, 9741.49] - - [4096, 3428, 1, 1024] - - [799, 9767.82] + - [861, 9767.82] - - [4096, 3126, 1, 1024] - - [814, 9701.9] + - [876, 9701.9] - - [64, 41, 1552, 41] - - [849, 3555.69] + - [911, 3555.69] - - [1024, 3325, 1, 4096] - - [801, 8962.41] + - [863, 8962.41] - - [4096, 3501, 1, 1024] - - [803, 9762.01] + - [865, 9762.01] - - [4096, 3358, 1, 1024] - - [799, 9680.42] + - [861, 9680.42] - - [1024, 3441, 1, 4096] - - [823, 9271.27] + - [885, 9271.27] - - [1024, 3552, 1, 4096] - - [819, 9565.42] + - [881, 9565.42] - - [4096, 3232, 1, 1024] - - [804, 9696.81] + - [866, 9696.81] - - [64, 18, 3440, 18] - - [824, 2059.33] + - [886, 2059.33] - - [1024, 3412, 1, 4096] - - [823, 9199.28] + - [885, 9199.28] - - [1024, 3372, 1, 4096] - - [820, 9083.49] + - [882, 9083.49] - - [1024, 3585, 1, 4096] - - [810, 8710.29] + - [872, 8710.29] - - [4096, 3143, 1, 1024] - - [815, 9692.12] + - [877, 9692.12] - - [4096, 3464, 1, 1024] - - [803, 9661.93] + - [865, 9661.93] - - [1024, 3145, 1, 4096] - - [802, 8526.33] + - [864, 8526.33] - - [4096, 3375, 1, 1024] - - [814, 9734.78] + - [876, 9734.78] - - [4096, 2917, 1, 1024] - - [799, 9714.57] + - [861, 9714.57] - - [4096, 3978, 1, 1024] - - [804, 9741.43] + - [866, 9741.43] - - [1024, 2765, 1, 4096] - - [805, 8706.75] + - [867, 8706.75] - - [64, 148, 432, 148] - - [830, 6372.17] + - [892, 6372.17] - - [1024, 3452, 1, 4096] - - [822, 9301.38] + - [884, 9301.38] - - [4096, 3584, 1, 1024] - - [804, 10005.7] + - [866, 10005.7] - - [4096, 3545, 1, 1024] - - [804, 9877.87] + - [866, 9877.87] - - [1024, 3352, 1, 4096] - - [823, 9035.19] + - [885, 9035.19] - - [64, 159, 400, 160] - - [832, 6952.11] + - [894, 6952.11] - - [4096, 3292, 1, 1024] - - [803, 9856.51] + - [865, 9856.51] - - [1024, 3525, 1, 4096] - - [823, 9501.5] + - [885, 9501.5] - - [1024, 3266, 1, 4096] - - [823, 8817.43] + - [885, 8817.43] - - [1024, 3382, 1, 4096] - - [822, 9101.54] + - [884, 9101.54] - - [4096, 3492, 1, 1024] - - [803, 9747.29] + - [865, 9747.29] - - [4096, 3419, 1, 1024] - - [815, 9745.88] + - [877, 9745.88] - - [1024, 3796, 1, 33708] - - [812, 9356.26] + - [874, 9356.26] - - [1024, 3293, 1, 4096] - - [819, 8868.4] + - [881, 8868.4] - - [4096, 3796, 1, 1024] - - [804, 9885.36] + - [866, 9885.36] - - [1024, 3487, 1, 4096] - - [820, 9391.34] + - [882, 9391.34] - - [4096, 3166, 1, 1024] - - [815, 9718.46] + - [877, 9718.46] - - [64, 102, 624, 101] - - [844, 5547.84] + - [906, 5547.84] - - [1024, 3409, 1, 4096] - - [823, 9187.88] + - [885, 9187.88] - - [1024, 3520, 1, 4096] - - [822, 9485.09] + - [884, 9485.09] - - [1024, 3573, 1, 4096] - - [823, 9652.71] + - [885, 9652.71] - - [4096, 3366, 1, 1024] - - [799, 9684.31] + - [861, 9684.31] - - [4096, 3720, 1, 1024] - - [815, 9703.34] + - [877, 9703.34] - - [4096, 3207, 1, 1024] - - [803, 9626.21] + - [865, 9626.21] - - [4096, 3272, 1, 1024] - - [803, 9795.51] + - [865, 9795.51] - - [1024, 3390, 1, 4096] - - [823, 9125.88] + - [885, 9125.88] - - [4096, 3183, 1, 1024] - - [815, 9825.87] + - [877, 9825.87] - - [4096, 3536, 1, 1024] - - [804, 9846.51] + - [866, 9846.51] - - [4096, 3563, 1, 1024] - - [804, 9913.8] + - [866, 9913.8] - - [1024, 3482, 1, 4096] - - [823, 9376.91] + - [885, 9376.91] - - [4096, 3447, 1, 1024] - - [814, 9875.09] + - [876, 9875.09] - - [4096, 3955, 1, 1024] - - [799, 9922.39] + - [861, 9922.39] - - [4096, 4005, 1, 1024] - - [804, 9803.43] + - [866, 9803.43] - - [1024, 3493, 1, 4096] - - [823, 9411.37] + - [885, 9411.37] - - [4096, 3410, 1, 1024] - - [799, 9788.34] + - [861, 9788.34] - - [1024, 3422, 1, 4096] - - [822, 9216.28] + - [884, 9216.28] - - [1024, 3350, 1, 4096] - - [817, 9068.02] + - [879, 9068.02] - - [4096, 3300, 1, 1024] - - [804, 9883.29] + - [866, 9883.29] - - [4096, 3910, 1, 1024] - - [814, 9800.12] + - [876, 9800.12] - - [1024, 3489, 1, 4096] - - [823, 9398.66] + - [885, 9398.66] - - [4096, 3483, 1, 1024] - - [803, 9715.96] + - [865, 9715.96] - - [4096, 3532, 1, 1024] - - [804, 9837.99] + - [866, 9837.99] - - [64, 101, 624, 101] - - [844, 5452.28] + - [906, 5452.28] - - [4096, 3230, 1, 1024] - - [804, 9683.6] + - [866, 9683.6] - - [4096, 3427, 1, 1024] - - [799, 9760.72] + - [861, 9760.72] - - [1024, 3377, 1, 4096] - - [823, 9101.17] + - [885, 9101.17] - - [1024, 3488, 1, 4096] - - [822, 9381.99] + - [884, 9381.99] - - [1024, 3616, 1, 4096] - - [805, 8709.33] + - [867, 8709.33] - - [1024, 3426, 1, 4096] - - [823, 9229.43] + - [885, 9229.43] - - [4096, 3357, 1, 1024] - - [815, 9668.5] + - [877, 9668.5] - - [4096, 3406, 1, 1024] - - [800, 9748.57] + - [862, 9748.57] - - [1024, 3046, 1, 4096] - - [805, 9590.43] + - [867, 9590.43] - - [1024, 3272, 1, 4096] - - [816, 8930.2] + - [878, 8930.2] - - [1024, 3256, 1, 4096] - - [801, 8828.16] + - [863, 8828.16] - - [4096, 3247, 1, 1024] - - [803, 9741.81] + - [865, 9741.81] - - [4096, 3088, 1, 1024] - - [815, 9589.07] + - [877, 9589.07] - - [1024, 3531, 1, 4096] - - [822, 9501.06] + - [884, 9501.06] - - [64, 160, 400, 160] - - [858, 7334.03] + - [920, 7334.03] - - [4096, 3511, 1, 1024] - - [804, 9789.38] + - [866, 9789.38] - - [1024, 3720, 1, 33708] - - [813, 9214.68] + - [875, 9214.68] - - [1024, 3267, 1, 4096] - - [816, 8831.04] + - [878, 8831.04] - - [1024, 3270, 1, 4096] - - [817, 8876.68] + - [879, 8876.68] - - [1024, 3461, 1, 4096] - - [822, 9327.55] + - [884, 9327.55] - - [4096, 3474, 1, 1024] - - [803, 9697.04] + - [865, 9697.04] - - [4096, 2984, 1, 1024] - - [804, 9674.08] + - [866, 9674.08] - - [1024, 3399, 1, 4096] - - [822, 9158.58] + - [884, 9158.58] - - [4096, 3574, 1, 1024] - - [803, 9942.3] + - [865, 9942.3] - - [1024, 3876, 1, 1024] - - [817, 9085.13] + - [879, 9085.13] - - [4096, 3337, 1, 1024] - - [800, 9611.43] + - [862, 9611.43] - - [4096, 3450, 1, 1024] - - [815, 9930.35] + - [877, 9930.35] - - [1024, 3720, 1, 1024] - - [801, 8755.49] + - [863, 8755.49] - - [1024, 4059, 1, 1024] - - [806, 9366.67] + - [868, 9366.67] - - [4096, 3291, 1, 1024] - - [803, 9856.33] + - [865, 9856.33] - - [64, 93, 688, 93] - - [847, 5497.11] + - [909, 5497.11] - - [4096, 3995, 1, 1024] - - [803, 9776.67] + - [865, 9776.67] - - [64, 147, 432, 147] - - [833, 6233.88] + - [895, 6233.88] - - [4096, 3491, 1, 1024] - - [803, 9742.94] + - [865, 9742.94] - - [4096, 3348, 1, 1024] - - [815, 9634.11] + - [877, 9634.11] - - [4096, 3925, 1, 1024] - - [814, 9848.54] + - [876, 9848.54] - - [4096, 3894, 1, 1024] - - [814, 9812.55] + - [876, 9812.55] - - [1024, 3456, 1, 4096] - - [823, 9317.91] + - [885, 9317.91] - - [1024, 3394, 1, 4096] - - [822, 9148.86] + - [884, 9148.86] - - [64, 100, 624, 102] - - [844, 5416.95] + - [906, 5416.95] - - [4096, 3165, 1, 1024] - - [814, 9743.35] + - [876, 9743.35] - - [4096, 3470, 1, 1024] - - [804, 9691.04] + - [866, 9691.04] - - [1024, 3014, 1, 4096] - - [805, 9486.26] + - [867, 9486.26] - - [1024, 3375, 1, 4096] - - [823, 9082.71] + - [885, 9082.71] - - [4096, 3859, 1, 1024] - - [814, 9738.87] + - [876, 9738.87] - - [4096, 3365, 1, 1024] - - [815, 9694.74] + - [877, 9694.74] - - [1024, 3162, 1, 4096] - - [816, 8550.31] + - [878, 8550.31] - - [1024, 3840, 1, 33708] - - [813, 9409.08] + - [875, 9409.08] - - [1024, 3437, 1, 4096] - - [823, 9270.49] + - [885, 9270.49] - - [4096, 3319, 1, 1024] - - [804, 9927.15] + - [866, 9927.15] - - [1024, 3320, 1, 4096] - - [823, 8962.29] + - [885, 8962.29] - - [64, 23, 2720, 23] - - [846, 2569.53] + - [908, 2569.53] - - [4096, 3328, 1, 1024] - - [803, 9997.41] + - [865, 9997.41] - - [1024, 3235, 1, 4096] - - [823, 8724.31] + - [885, 8724.31] - - [4096, 3282, 1, 1024] - - [804, 9827.13] + - [866, 9827.13] - - [1024, 3367, 1, 4096] - - [816, 9084.02] + - [878, 9084.02] - - [1024, 3542, 1, 4096] - - [823, 9533.1] + - [885, 9533.1] - - [64, 177, 352, 177] - - [809, 6817.91] + - [871, 6817.91] - - [4096, 3145, 1, 1024] - - [800, 9710.28] + - [862, 9710.28] - - [4096, 3514, 1, 1024] - - [803, 9793.06] + - [865, 9793.06] - - [1024, 3432, 1, 4096] - - [823, 9249.39] + - [885, 9249.39] - - [4096, 3409, 1, 1024] - - [799, 9721.6] + - [861, 9721.6] - - [1024, 4012, 1, 33708] - - [803, 9773.35] + - [865, 9773.35] - - [4096, 3876, 1, 1024] - - [800, 9745.65] + - [862, 9745.65] - - [4096, 3299, 1, 1024] - - [803, 9873.53] + - [865, 9873.53] - - [1024, 3168, 1, 4096] - - [816, 8597.13] + - [878, 8597.13] - - [4096, 3681, 1, 1024] - - [815, 9840.03] + - [877, 9840.03] - - [4096, 3531, 1, 1024] - - [804, 9847.76] + - [866, 9847.76] - - [4096, 3388, 1, 1024] - - [815, 9772.28] + - [877, 9772.28] - - [1024, 3720, 1, 4096] - - [804, 8951.6] + - [866, 8951.6] - - [1024, 3332, 1, 4096] - - [823, 8978.97] + - [885, 8978.97] - - [1024, 3273, 1, 4096] - - [817, 8982.49] + - [879, 8982.49] - - [1024, 2935, 1, 4096] - - [806, 9224.89] + - [868, 9224.89] - - [1024, 3467, 1, 4096] - - [820, 9329.33] + - [882, 9329.33] - - [4096, 3542, 1, 1024] - - [803, 9858.51] + - [865, 9858.51] - - [1024, 3130, 1, 4096] - - [802, 8526.66] + - [864, 8526.66] - - [1024, 3405, 1, 4096] - - [823, 9163.44] + - [885, 9163.44] - - [1024, 3960, 1, 1024] - - [801, 9280.36] + - [863, 9280.36] - - [4096, 3405, 1, 1024] - - [814, 9710.2] + - [876, 9710.2] - - [512, 512, 1, 1024] - - [1000, 6670.96] + - [1062, 6670.96] - - [8, 500, 1, 512] - - [896, 228.671] + - [958, 228.671] - - [512, 512, 1, 2000] - - [1033, 7629.44] + - [1095, 7629.44] - - [32, 512, 1, 512] - - [893, 904.045] + - [955, 904.045] - - [100, 1024, 1, 2048] - - [955, 3196.98] + - [1017, 3196.98] - - [8, 512, 1, 500] - - [886, 237.137] + - [948, 237.137] - - [8, 500, 1, 1024] - - [950, 289.366] + - [1012, 289.366] - - [100, 2000, 1, 1024] - - [989, 3368.52] + - [1051, 3368.52] - - [64, 1024, 1, 100] - - [888, 941.709] + - [950, 941.709] - - [64, 1024, 1, 500] - - [1015, 2659.84] + - [1077, 2659.84] - - [64, 1024, 1, 1024] - - [953, 2452.91] + - [1015, 2452.91] - - [128, 2000, 1, 100] - - [1009, 2560.1] + - [1071, 2560.1] - - [2, 500, 1, 2048] - - [950, 72.2127] + - [1012, 72.2127] - - [16, 512, 1, 10] - - [864, 18.3857] + - [926, 18.3857] - - [64, 2000, 1, 1024] - - [1020, 2800.78] + - [1082, 2800.78] - - [100, 1024, 1, 1024] - - [948, 3034.17] + - [1010, 3034.17] - - [8, 512, 1, 10] - - [926, 9.24286] + - [988, 9.24286] - - [16, 500, 1, 2048] - - [950, 565.846] + - [1012, 565.846] - - [10, 100, 1, 500] - - [886, 58.5112] + - [948, 58.5112] - - [16, 100, 1, 10] - - [926, 3.67143] + - [988, 3.67143] - - [500, 1024, 1, 512] - - [1016, 6514.61] + - [1078, 6514.61] - - [128, 1024, 1, 512] - - [1034, 4194.4] + - [1096, 4194.4] - - [512, 500, 1, 2000] - - [992, 7347.98] + - [1054, 7347.98] - - [2, 100, 1, 2000] - - [886, 20.9333] + - [948, 20.9333] - - [500, 512, 1, 100] - - [1008, 2539.78] + - [1070, 2539.78] - - [100, 1024, 1, 500] - - [1034, 3216.18] + - [1096, 3216.18] - - [256, 100, 1, 2048] - - [1044, 1689.17] + - [1106, 1689.17] - - [2, 512, 1, 512] - - [900, 50.5123] + - [962, 50.5123] - - [128, 2000, 1, 512] - - [1020, 4641.46] + - [1082, 4641.46] - - [2, 100, 1, 10] - - [864, 0.496825] + - [926, 0.496825] - - [16, 2000, 1, 2048] - - [908, 1266.25] + - [970, 1266.25] - - [200, 100, 1, 100] - - [1054, 316.556] + - [1116, 316.556] - - [256, 1024, 1, 100] - - [1010, 2686.0] + - [1072, 2686.0] - - [200, 500, 1, 1024] - - [1059, 3282.15] + - [1121, 3282.15] - - [500, 100, 1, 100] - - [973, 631.413] + - [1035, 631.413] - - [4, 100, 1, 10] - - [871, 0.977193] + - [933, 0.977193] - - [32, 100, 1, 512] - - [950, 198.935] + - [1012, 198.935] - - [100, 2000, 1, 512] - - [1020, 3832.44] + - [1082, 3832.44] - - [16, 1024, 1, 512] - - [934, 794.476] + - [996, 794.476] - - [200, 512, 1, 100] - - [1052, 1306.22] + - [1114, 1306.22] - - [4, 1024, 1, 1024] - - [893, 213.225] + - [955, 213.225] - - [512, 1024, 1, 512] - - [1017, 7049.35] + - [1079, 7049.35] - - [4, 512, 1, 10] - - [925, 4.59123] + - [987, 4.59123] - - [2, 2048, 1, 2000] - - [886, 300.393] + - [948, 300.393] - - [64, 2048, 1, 10] - - [1046, 241.041] + - [1108, 241.041] - - [128, 100, 1, 10] - - [1051, 27.6862] + - [1113, 27.6862] - - [4, 512, 1, 2048] - - [886, 146.549] + - [948, 146.549] - - [64, 2048, 1, 500] - - [1026, 4015.79] + - [1088, 4015.79] - - [512, 512, 1, 512] - - [981, 6123.17] + - [1043, 6123.17] - - [500, 500, 1, 2000] - - [992, 7126.67] + - [1054, 7126.67] - - [10, 1024, 1, 2000] - - [959, 807.671] + - [1021, 807.671] - - [256, 100, 1, 100] - - [971, 296.396] + - [1033, 296.396] - - [32, 2000, 1, 2048] - - [914, 2167.3] + - [976, 2167.3] - - [64, 1024, 1, 2048] - - [947, 2383.23] + - [1009, 2383.23] - - [200, 2048, 1, 512] - - [1022, 5264.04] + - [1084, 5264.04] - - [256, 500, 1, 10] - - [1004, 210.626] + - [1066, 210.626] - - [16, 1024, 1, 100] - - [884, 262.664] + - [946, 262.664] - - [32, 1024, 1, 1024] - - [889, 1476.97] + - [951, 1476.97] - - [512, 500, 1, 512] - - [978, 5851.53] + - [1040, 5851.53] - - [128, 1024, 1, 2000] - - [1062, 5516.6] + - [1124, 5516.6] - - [8, 100, 1, 500] - - [886, 46.3963] + - [948, 46.3963] - - [100, 2000, 1, 2048] - - [1041, 3715.63] + - [1103, 3715.63] - - [10, 512, 1, 512] - - [896, 292.671] + - [958, 292.671] - - [8, 500, 1, 10] - - [925, 8.87193] + - [987, 8.87193] - - [10, 2000, 1, 1024] - - [939, 640.1] + - [1001, 640.1] - - [16, 1024, 1, 10] - - [924, 36.6714] + - [986, 36.6714] - - [16, 512, 1, 2048] - - [903, 585.897] + - [965, 585.897] - - [256, 512, 1, 10] - - [969, 230.861] + - [1031, 230.861] - - [2, 2000, 1, 100] - - [931, 64.2026] + - [993, 64.2026] - - [128, 512, 1, 2048] - - [898, 3106.99] + - [960, 3106.99] - - [128, 512, 1, 100] - - [891, 952.658] + - [953, 952.658] - - [512, 2000, 1, 1024] - - [988, 8066.07] + - [1050, 8066.07] - - [64, 500, 1, 2048] - - [1057, 1857.7] + - [1119, 1857.7] - - [64, 2000, 1, 2048] - - [1039, 3442.12] + - [1101, 3442.12] - - [64, 2048, 1, 512] - - [1040, 3315.76] + - [1102, 3315.76] - - [10, 2000, 1, 512] - - [886, 785.376] + - [948, 785.376] - - [32, 2000, 1, 500] - - [889, 2500.1] + - [951, 2500.1] - - [64, 2000, 1, 10] - - [877, 231.984] + - [939, 231.984] - - [500, 100, 1, 10] - - [974, 88.1282] + - [1036, 88.1282] - - [128, 1024, 1, 500] - - [1025, 4096.1] + - [1087, 4096.1] - - [64, 100, 1, 2048] - - [886, 587.34] + - [948, 587.34] - - [64, 100, 1, 10] - - [1045, 12.0403] + - [1107, 12.0403] - - [16, 512, 1, 500] - - [896, 461.361] + - [958, 461.361] - - [32, 2000, 1, 1024] - - [883, 1713.91] + - [945, 1713.91] - - [200, 512, 1, 1024] - - [1062, 3244.46] + - [1124, 3244.46] - - [128, 2048, 1, 10] - - [878, 455.211] + - [940, 455.211] - - [200, 100, 1, 2000] - - [886, 1462.09] + - [948, 1462.09] - - [2, 100, 1, 512] - - [886, 12.5272] + - [948, 12.5272] - - [64, 2048, 1, 100] - - [1052, 1689.17] + - [1114, 1689.17] - - [32, 512, 1, 100] - - [885, 266.074] + - [947, 266.074] - - [16, 512, 1, 1024] - - [950, 569.978] + - [1012, 569.978] - - [4, 1024, 1, 512] - - [940, 208.151] + - [1002, 208.151] - - [64, 2000, 1, 100] - - [1052, 1649.58] + - [1114, 1649.58] - - [512, 2048, 1, 512] - - [988, 7849.09] + - [1050, 7849.09] - - [2, 500, 1, 500] - - [874, 53.5188] + - [936, 53.5188] - - [32, 100, 1, 100] - - [885, 57.2429] + - [947, 57.2429] - - [100, 500, 1, 2000] - - [889, 2784.06] + - [951, 2784.06] - - [200, 2000, 1, 100] - - [961, 2994.11] + - [1023, 2994.11] - - [10, 512, 1, 10] - - [921, 11.1345] + - [983, 11.1345] - - [100, 500, 1, 2048] - - [1061, 2361.72] + - [1123, 2361.72] - - [4, 2048, 1, 500] - - [896, 379.359] + - [958, 379.359] - - [200, 500, 1, 100] - - [1022, 1288.76] + - [1084, 1288.76] - - [500, 500, 1, 500] - - [978, 5425.45] + - [1040, 5425.45] - - [2, 100, 1, 1024] - - [950, 16.3025] + - [1012, 16.3025] - - [128, 2048, 1, 512] - - [1036, 4699.6] + - [1098, 4699.6] - - [200, 2000, 1, 1024] - - [986, 4621.04] + - [1048, 4621.04] - - [32, 512, 1, 1024] - - [949, 1028.12] + - [1011, 1028.12] - - [100, 2048, 1, 500] - - [1010, 4142.49] + - [1072, 4142.49] - - [256, 100, 1, 1024] - - [1040, 1443.62] + - [1102, 1443.62] - - [16, 2000, 1, 500] - - [935, 1428.67] + - [997, 1428.67] - - [128, 100, 1, 100] - - [885, 213.433] + - [947, 213.433] - - [500, 500, 1, 2048] - - [982, 6639.1] + - [1044, 6639.1] - - [32, 512, 1, 10] - - [918, 36.0298] + - [980, 36.0298] - - [128, 100, 1, 1024] - - [946, 791.598] + - [1008, 791.598] - - [16, 500, 1, 2000] - - [959, 694.544] + - [1021, 694.544] - - [4, 2048, 1, 100] - - [930, 129.72] + - [992, 129.72] - - [64, 500, 1, 500] - - [872, 1333.43] + - [934, 1333.43] - - [500, 1024, 1, 2048] - - [991, 7031.86] + - [1053, 7031.86] - - [512, 2048, 1, 100] - - [966, 5285.26] + - [1028, 5285.26] - - [128, 512, 1, 1024] - - [1058, 2519.2] + - [1120, 2519.2] - - [128, 512, 1, 2000] - - [1056, 3608.91] + - [1118, 3608.91] - - [128, 2000, 1, 2000] - - [1029, 7017.64] + - [1091, 7017.64] - - [2, 512, 1, 10] - - [922, 2.13175] + - [984, 2.13175] - - [10, 512, 1, 500] - - [886, 293.678] + - [948, 293.678] - - [4, 1024, 1, 2000] - - [906, 326.215] + - [968, 326.215] - - [256, 100, 1, 2000] - - [1043, 1768.06] + - [1105, 1768.06] - - [512, 2048, 1, 2000] - - [988, 8674.62] + - [1050, 8674.62] - - [100, 100, 1, 10] - - [1050, 21.6517] + - [1112, 21.6517] - - [256, 500, 1, 1024] - - [990, 4833.14] + - [1052, 4833.14] - - [128, 512, 1, 10] - - [878, 132.229] + - [940, 132.229] - - [256, 100, 1, 500] - - [1037, 914.386] + - [1099, 914.386] - - [64, 100, 1, 512] - - [944, 369.109] + - [1006, 369.109] - - [64, 512, 1, 500] - - [886, 1600.1] + - [948, 1600.1] - - [64, 2048, 1, 2000] - - [1040, 5925.6] + - [1102, 5925.6] - - [100, 2048, 1, 1024] - - [998, 3260.6] + - [1060, 3260.6] - - [200, 2000, 1, 10] - - [878, 595.338] + - [940, 595.338] - - [128, 1024, 1, 100] - - [1022, 1689.17] + - [1084, 1689.17] - - [16, 2000, 1, 100] - - [885, 493.927] + - [947, 493.927] - - [8, 100, 1, 512] - - [886, 49.8087] + - [948, 49.8087] - - [500, 2048, 1, 1024] - - [988, 7651.71] + - [1050, 7651.71] - - [500, 2000, 1, 10] - - [976, 1008.16] + - [1038, 1008.16] - - [32, 100, 1, 500] - - [950, 187.016] + - [1012, 187.016] - - [256, 1024, 1, 2048] - - [991, 6190.95] + - [1053, 6190.95] - - [32, 500, 1, 2048] - - [886, 1083.7] + - [948, 1083.7] - - [4, 2000, 1, 10] - - [929, 17.6439] + - [991, 17.6439] - - [128, 500, 1, 2000] - - [946, 3516.58] + - [1008, 3516.58] - - [8, 1024, 1, 10] - - [920, 18.0649] + - [982, 18.0649] - - [2, 500, 1, 100] - - [865, 16.1256] + - [927, 16.1256] - - [10, 500, 1, 512] - - [886, 291.009] + - [948, 291.009] - - [10, 2000, 1, 10] - - [864, 38.5615] + - [926, 38.5615] - - [500, 512, 1, 512] - - [981, 5893.63] + - [1043, 5893.63] - - [32, 500, 1, 500] - - [886, 892.957] + - [948, 892.957] - - [256, 500, 1, 2000] - - [995, 6237.92] + - [1057, 6237.92] - - [100, 500, 1, 100] - - [897, 726.844] + - [959, 726.844] - - [500, 2048, 1, 100] - - [970, 4867.02] + - [1032, 4867.02] - - [10, 1024, 1, 512] - - [886, 520.227] + - [948, 520.227] - - [2, 2048, 1, 512] - - [896, 151.628] + - [958, 151.628] - - [256, 512, 1, 100] - - [975, 1590.78] + - [1037, 1590.78] - - [10, 2048, 1, 100] - - [886, 324.151] + - [948, 324.151] - - [8, 2048, 1, 100] - - [941, 256.1] + - [1003, 256.1] - - [512, 100, 1, 512] - - [1037, 2100.61] + - [1099, 2100.61] - - [4, 500, 1, 500] - - [886, 115.841] + - [948, 115.841] - - [64, 100, 1, 1024] - - [886, 450.21] + - [948, 450.21] - - [2, 2048, 1, 1024] - - [943, 137.708] + - [1005, 137.708] - - [2, 500, 1, 2000] - - [912, 90.3527] + - [974, 90.3527] - - [512, 1024, 1, 500] - - [1017, 6898.63] + - [1079, 6898.63] - - [128, 2000, 1, 500] - - [1022, 5161.39] + - [1084, 5161.39] - - [32, 512, 1, 2048] - - [956, 1103.86] + - [1018, 1103.86] - - [10, 100, 1, 2000] - - [886, 106.032] + - [948, 106.032] - - [4, 100, 1, 512] - - [886, 24.7154] + - [948, 24.7154] - - [2, 512, 1, 2048] - - [950, 73.3246] + - [1012, 73.3246] - - [200, 512, 1, 2048] - - [1062, 3954.01] + - [1124, 3954.01] - - [200, 2000, 1, 2000] - - [1024, 6230.63] + - [1086, 6230.63] - - [100, 100, 1, 2000] - - [886, 827.915] + - [948, 827.915] - - [500, 2048, 1, 2000] - - [987, 8388.04] + - [1049, 8388.04] - - [64, 2048, 1, 2048] - - [1032, 3406.64] + - [1094, 3406.64] - - [16, 2000, 1, 1024] - - [892, 1024.1] + - [954, 1024.1] - - [512, 2048, 1, 1024] - - [965, 8061.22] + - [1027, 8061.22] - - [10, 500, 1, 500] - - [896, 284.191] + - [958, 284.191] - - [200, 1024, 1, 2048] - - [1060, 4886.29] + - [1122, 4886.29] - - [10, 2000, 1, 2000] - - [886, 1449.38] + - [948, 1449.38] - - [8, 2000, 1, 500] - - [935, 719.524] + - [997, 719.524] - - [2, 100, 1, 2048] - - [950, 19.945] + - [1012, 19.945] - - [32, 100, 1, 2048] - - [950, 323.894] + - [1012, 323.894] - - [512, 512, 1, 10] - - [1007, 420.203] + - [1069, 420.203] - - [512, 500, 1, 10] - - [1012, 376.571] + - [1074, 376.571] - - [16, 100, 1, 1024] - - [896, 129.72] + - [958, 129.72] - - [2, 500, 1, 10] - - [860, 2.21864] + - [922, 2.21864] - - [200, 512, 1, 10] - - [862, 188.335] + - [924, 188.335] - - [512, 1024, 1, 100] - - [962, 3877.97] + - [1024, 3877.97] - - [16, 2000, 1, 2000] - - [886, 2222.32] + - [948, 2222.32] - - [500, 500, 1, 1024] - - [982, 6130.37] + - [1044, 6130.37] - - [500, 100, 1, 2048] - - [1037, 2949.41] + - [1099, 2949.41] - - [256, 1024, 1, 512] - - [1001, 5886.84] + - [1063, 5886.84] - - [256, 500, 1, 512] - - [979, 4380.85] + - [1041, 4380.85] - - [16, 1024, 1, 2000] - - [950, 1208.36] + - [1012, 1208.36] - - [200, 500, 1, 2048] - - [1062, 3855.52] + - [1124, 3855.52] - - [256, 2000, 1, 10] - - [964, 727.373] + - [1026, 727.373] - - [10, 2048, 1, 2048] - - [917, 823.158] + - [979, 823.158] - - [512, 2000, 1, 100] - - [966, 5120.1] + - [1028, 5120.1] - - [10, 1024, 1, 1024] - - [893, 553.146] + - [955, 553.146] - - [512, 2000, 1, 2048] - - [994, 7563.4] + - [1056, 7563.4] - - [500, 1024, 1, 500] - - [1018, 6570.94] + - [1080, 6570.94] - - [500, 100, 1, 512] - - [1037, 2038.32] + - [1099, 2038.32] - - [256, 2000, 1, 100] - - [986, 3764.81] + - [1048, 3764.81] - - [512, 1024, 1, 2048] - - [1030, 7286.62] + - [1092, 7286.62] - - [32, 512, 1, 500] - - [886, 898.346] + - [948, 898.346] - - [100, 2000, 1, 10] - - [878, 333.433] + - [940, 333.433] - - [100, 500, 1, 512] - - [1056, 2176.97] + - [1118, 2176.97] - - [8, 2000, 1, 512] - - [935, 602.453] + - [997, 602.453] - - [100, 2048, 1, 2048] - - [1042, 3694.87] + - [1104, 3694.87] - - [128, 1024, 1, 2048] - - [1061, 4168.35] + - [1123, 4168.35] - - [8, 500, 1, 2000] - - [960, 352.213] + - [1022, 352.213] - - [100, 2000, 1, 500] - - [1010, 4045.41] + - [1072, 4045.41] - - [100, 2048, 1, 100] - - [1010, 2081.4] + - [1072, 2081.4] - - [4, 100, 1, 1024] - - [886, 33.1323] + - [948, 33.1323] - - [500, 2048, 1, 2048] - - [994, 7765.03] + - [1056, 7765.03] - - [2, 2000, 1, 2048] - - [905, 166.334] + - [967, 166.334] - - [200, 2048, 1, 10] - - [879, 609.624] + - [941, 609.624] - - [2, 500, 1, 1024] - - [950, 75.3941] + - [1012, 75.3941] - - [100, 500, 1, 1024] - - [946, 1975.41] + - [1008, 1975.41] - - [16, 2048, 1, 500] - - [886, 1473.48] + - [948, 1473.48] - - [100, 1024, 1, 10] - - [1046, 185.607] + - [1108, 185.607] - - [8, 2048, 1, 1024] - - [942, 543.404] + - [1004, 543.404] - - [2, 2000, 1, 500] - - [886, 179.956] + - [948, 179.956] - - [32, 100, 1, 1024] - - [886, 267.812] + - [948, 267.812] - - [500, 2000, 1, 512] - - [1016, 7087.59] + - [1078, 7087.59] - - [64, 100, 1, 2000] - - [896, 615.485] + - [958, 615.485] - - [100, 1024, 1, 2000] - - [1059, 4224.52] + - [1121, 4224.52] - - [64, 500, 1, 10] - - [861, 63.5921] + - [923, 63.5921] - - [32, 2048, 1, 100] - - [882, 941.709] + - [944, 941.709] - - [64, 500, 1, 512] - - [886, 1575.48] + - [948, 1575.48] - - [10, 100, 1, 1024] - - [896, 82.6806] + - [958, 82.6806] - - [16, 512, 1, 100] - - [885, 148.506] + - [947, 148.506] - - [4, 100, 1, 2000] - - [959, 43.9597] + - [1021, 43.9597] - - [2, 512, 1, 1024] - - [950, 74.152] + - [1012, 74.152] - - [64, 512, 1, 1024] - - [951, 1571.0] + - [1013, 1571.0] - - [10, 2048, 1, 500] - - [886, 920.963] + - [948, 920.963] - - [4, 2000, 1, 2048] - - [905, 326.215] + - [967, 326.215] - - [512, 100, 1, 2048] - - [1040, 3084.15] + - [1102, 3084.15] - - [32, 100, 1, 2000] - - [886, 343.448] + - [948, 343.448] - - [256, 512, 1, 500] - - [979, 4311.68] + - [1041, 4311.68] - - [100, 2000, 1, 100] - - [1010, 2016.23] + - [1072, 2016.23] - - [8, 2000, 1, 1024] - - [899, 544.781] + - [961, 544.781] - - [4, 512, 1, 500] - - [886, 118.619] + - [948, 118.619] - - [128, 1024, 1, 10] - - [1049, 244.637] + - [1111, 244.637] - - [4, 500, 1, 1024] - - [886, 144.733] + - [948, 144.733] - - [32, 2048, 1, 512] - - [889, 2140.05] + - [951, 2140.05] - - [32, 100, 1, 10] - - [864, 7.11754] + - [926, 7.11754] - - [100, 2048, 1, 10] - - [1053, 341.433] + - [1115, 341.433] - - [512, 500, 1, 100] - - [1014, 2461.64] + - [1076, 2461.64] - - [128, 2000, 1, 1024] - - [998, 4174.37] + - [1060, 4174.37] - - [200, 1024, 1, 500] - - [1010, 4295.4] + - [1072, 4295.4] - - [32, 2048, 1, 1024] - - [913, 1667.82] + - [975, 1667.82] - - [10, 1024, 1, 2048] - - [904, 555.49] + - [966, 555.49] - - [8, 500, 1, 100] - - [885, 71.5286] + - [947, 71.5286] - - [32, 2048, 1, 500] - - [889, 2528.5] + - [951, 2528.5] - - [200, 100, 1, 1024] - - [898, 1071.23] + - [960, 1071.23] - - [16, 100, 1, 100] - - [875, 28.6714] + - [937, 28.6714] - - [8, 1024, 1, 2000] - - [959, 654.413] + - [1021, 654.413] - - [4, 512, 1, 100] - - [885, 36.6714] + - [947, 36.6714] - - [16, 500, 1, 100] - - [885, 142.957] + - [947, 142.957] - - [8, 1024, 1, 2048] - - [911, 441.606] + - [973, 441.606] - - [16, 1024, 1, 2048] - - [912, 886.845] + - [974, 886.845] - - [10, 2048, 1, 1024] - - [890, 639.476] + - [952, 639.476] - - [64, 512, 1, 100] - - [885, 518.581] + - [947, 518.581] - - [2, 100, 1, 500] - - [886, 9.71538] + - [948, 9.71538] - - [2, 500, 1, 512] - - [892, 48.2203] + - [954, 48.2203] - - [256, 512, 1, 2000] - - [995, 6450.49] + - [1057, 6450.49] - - [128, 500, 1, 1024] - - [889, 2497.66] + - [951, 2497.66] - - [10, 100, 1, 10] - - [926, 2.33214] + - [988, 2.33214] - - [8, 2048, 1, 2048] - - [876, 643.398] + - [938, 643.398] - - [16, 2048, 1, 2048] - - [916, 1338.0] + - [978, 1338.0] - - [64, 1024, 1, 10] - - [879, 132.229] + - [941, 132.229] - - [500, 100, 1, 500] - - [1037, 1941.09] + - [1099, 1941.09] - - [256, 1024, 1, 2000] - - [1033, 7629.44] + - [1095, 7629.44] - - [200, 512, 1, 500] - - [1022, 3232.42] + - [1084, 3232.42] - - [8, 2000, 1, 10] - - [923, 32.3581] + - [985, 32.3581] - - [64, 2000, 1, 512] - - [1021, 3225.3] + - [1083, 3225.3] - - [2, 512, 1, 100] - - [865, 16.7234] + - [927, 16.7234] - - [4, 2000, 1, 2000] - - [886, 586.61] + - [948, 586.61] - - [200, 1024, 1, 100] - - [1010, 2133.43] + - [1072, 2133.43] - - [16, 100, 1, 500] - - [950, 92.6926] + - [1012, 92.6926] - - [128, 100, 1, 500] - - [946, 526.416] + - [1008, 526.416] - - [500, 1024, 1, 1024] - - [980, 7201.86] + - [1042, 7201.86] - - [200, 1024, 1, 1024] - - [1032, 4519.82] + - [1094, 4519.82] - - [8, 2048, 1, 512] - - [896, 624.252] + - [958, 624.252] - - [200, 2000, 1, 500] - - [986, 5186.82] + - [1048, 5186.82] - - [512, 100, 1, 1024] - - [1037, 2742.19] + - [1099, 2742.19] - - [16, 100, 1, 2000] - - [896, 168.876] + - [958, 168.876] - - [500, 512, 1, 2000] - - [1033, 7289.39] + - [1095, 7289.39] - - [8, 2000, 1, 2048] - - [907, 668.289] + - [969, 668.289] - - [256, 2048, 1, 100] - - [968, 3924.41] + - [1030, 3924.41] - - [32, 2048, 1, 2000] - - [900, 3882.56] + - [962, 3882.56] - - [200, 500, 1, 512] - - [1025, 3368.52] + - [1087, 3368.52] - - [10, 512, 1, 100] - - [885, 91.5286] + - [947, 91.5286] - - [16, 2000, 1, 10] - - [863, 61.6385] + - [925, 61.6385] - - [8, 512, 1, 100] - - [885, 72.2127] + - [947, 72.2127] - - [256, 512, 1, 512] - - [990, 4584.04] + - [1052, 4584.04] - - [500, 2000, 1, 1024] - - [965, 7569.59] + - [1027, 7569.59] - - [512, 512, 1, 500] - - [981, 5708.81] + - [1043, 5708.81] - - [256, 2048, 1, 1024] - - [1005, 5923.21] + - [1067, 5923.21] - - [8, 2048, 1, 2000] - - [886, 1153.9] + - [948, 1153.9] - - [100, 512, 1, 2048] - - [952, 2383.23] + - [1014, 2383.23] - - [100, 1024, 1, 512] - - [1037, 3343.77] + - [1099, 3343.77] - - [128, 100, 1, 2000] - - [1055, 1084.85] + - [1117, 1084.85] - - [4, 2048, 1, 2048] - - [904, 332.454] + - [966, 332.454] - - [2, 1024, 1, 2000] - - [915, 161.106] + - [977, 161.106] - - [100, 512, 1, 512] - - [889, 2184.63] + - [951, 2184.63] - - [128, 1024, 1, 1024] - - [1032, 3848.09] + - [1094, 3848.09] - - [200, 2048, 1, 1024] - - [967, 4547.26] + - [1029, 4547.26] - - [32, 1024, 1, 2000] - - [896, 2416.62] + - [958, 2416.62] - - [128, 500, 1, 100] - - [891, 919.64] + - [953, 919.64] - - [200, 512, 1, 2000] - - [1059, 4238.51] + - [1121, 4238.51] - - [10, 2048, 1, 2000] - - [896, 1454.65] + - [958, 1454.65] - - [256, 1024, 1, 500] - - [993, 5669.3] + - [1055, 5669.3] - - [100, 100, 1, 100] - - [885, 171.333] + - [947, 171.333] - - [8, 512, 1, 1024] - - [954, 286.596] + - [1016, 286.596] - - [200, 1024, 1, 512] - - [1010, 4354.65] + - [1072, 4354.65] - - [256, 500, 1, 500] - - [995, 4020.2] + - [1057, 4020.2] - - [200, 100, 1, 500] - - [1059, 702.347] + - [1121, 702.347] - - [2, 1024, 1, 2048] - - [905, 112.85] + - [967, 112.85] - - [256, 500, 1, 2048] - - [995, 5041.33] + - [1057, 5041.33] - - [512, 2048, 1, 500] - - [988, 7710.22] + - [1050, 7710.22] - - [512, 100, 1, 2000] - - [1037, 3099.37] + - [1099, 3099.37] - - [512, 500, 1, 1024] - - [996, 6463.22] + - [1058, 6463.22] - - [16, 512, 1, 2000] - - [912, 721.227] + - [974, 721.227] - - [64, 500, 1, 1024] - - [951, 1528.46] + - [1013, 1528.46] - - [512, 2000, 1, 10] - - [972, 1174.41] + - [1034, 1174.41] - - [256, 512, 1, 1024] - - [990, 4978.5] + - [1052, 4978.5] - - [10, 512, 1, 1024] - - [950, 370.36] + - [1012, 370.36] - - [512, 100, 1, 100] - - [973, 659.894] + - [1035, 659.894] - - [8, 2000, 1, 100] - - [885, 256.51] + - [947, 256.51] - - [128, 2048, 1, 1024] - - [998, 4173.54] + - [1060, 4173.54] - - [2, 2000, 1, 2000] - - [886, 250.727] + - [948, 250.727] - - [16, 2048, 1, 1024] - - [933, 1046.06] + - [995, 1046.06] - - [500, 512, 1, 500] - - [978, 5517.34] + - [1040, 5517.34] - - [8, 100, 1, 1024] - - [951, 64.1] + - [1013, 64.1] - - [10, 100, 1, 100] - - [875, 17.9571] + - [937, 17.9571] - - [200, 500, 1, 500] - - [1025, 3140.8] + - [1087, 3140.8] - - [10, 500, 1, 2000] - - [912, 444.94] + - [974, 444.94] - - [500, 100, 1, 2000] - - [1040, 2969.22] + - [1102, 2969.22] - - [100, 512, 1, 2000] - - [952, 2776.67] + - [1014, 2776.67] - - [500, 1024, 1, 2000] - - [1031, 8020.15] + - [1093, 8020.15] - - [32, 2000, 1, 2000] - - [892, 3827.85] + - [954, 3827.85] - - [64, 1024, 1, 512] - - [1056, 2573.29] + - [1118, 2573.29] - - [64, 2000, 1, 2000] - - [1025, 5797.2] + - [1087, 5797.2] - - [32, 500, 1, 100] - - [885, 266.767] + - [947, 266.767] - - [128, 2000, 1, 2048] - - [1041, 4548.05] + - [1103, 4548.05] - - [10, 100, 1, 2048] - - [950, 98.5615] + - [1012, 98.5615] - - [32, 2048, 1, 2048] - - [913, 2213.45] + - [975, 2213.45] - - [64, 100, 1, 100] - - [886, 96.4855] + - [948, 96.4855] - - [2, 1024, 1, 100] - - [936, 34.6946] + - [998, 34.6946] - - [256, 1024, 1, 10] - - [1006, 425.658] + - [1068, 425.658] - - [256, 1024, 1, 1024] - - [999, 5482.85] + - [1061, 5482.85] - - [64, 500, 1, 2000] - - [886, 2056.66] + - [948, 2056.66] - - [512, 2000, 1, 512] - - [984, 7550.33] + - [1046, 7550.33] - - [8, 512, 1, 512] - - [893, 232.086] + - [955, 232.086] - - [8, 512, 1, 2048] - - [886, 290.564] + - [948, 290.564] - - [100, 100, 1, 1024] - - [1056, 624.49] + - [1118, 624.49] - - [2, 2048, 1, 10] - - [929, 8.92759] + - [991, 8.92759] - - [4, 2048, 1, 512] - - [935, 312.176] + - [997, 312.176] - - [4, 2048, 1, 10] - - [928, 18.0649] + - [990, 18.0649] - - [8, 100, 1, 2000] - - [905, 85.9369] + - [967, 85.9369] - - [2, 1024, 1, 1024] - - [902, 101.314] + - [964, 101.314] - - [16, 2048, 1, 100] - - [886, 518.581] + - [948, 518.581] - - [16, 512, 1, 512] - - [896, 456.003] + - [958, 456.003] - - [32, 500, 1, 512] - - [893, 906.295] + - [955, 906.295] - - [500, 2000, 1, 2000] - - [988, 8143.42] + - [1050, 8143.42] - - [500, 1024, 1, 10] - - [969, 680.951] + - [1031, 680.951] - - [32, 500, 1, 1024] - - [945, 1008.97] + - [1007, 1008.97] - - [32, 500, 1, 10] - - [881, 33.4333] + - [943, 33.4333] - - [500, 500, 1, 10] - - [1010, 367.747] + - [1072, 367.747] - - [4, 2000, 1, 500] - - [896, 370.47] + - [958, 370.47] - - [10, 2000, 1, 500] - - [886, 899.381] + - [948, 899.381] - - [32, 2000, 1, 512] - - [898, 2089.9] + - [960, 2089.9] - - [256, 500, 1, 100] - - [1011, 1495.43] + - [1073, 1495.43] - - [256, 2048, 1, 10] - - [969, 789.69] + - [1031, 789.69] - - [4, 1024, 1, 500] - - [886, 222.709] + - [948, 222.709] - - [256, 512, 1, 2048] - - [995, 5292.6] + - [1057, 5292.6] - - [2, 2000, 1, 1024] - - [933, 137.365] + - [995, 137.365] - - [256, 100, 1, 512] - - [1037, 1085.13] + - [1099, 1085.13] - - [8, 1024, 1, 500] - - [886, 441.479] + - [948, 441.479] - - [256, 2048, 1, 500] - - [1016, 7031.86] + - [1078, 7031.86] - - [256, 2048, 1, 2048] - - [979, 6771.93] + - [1041, 6771.93] - - [2, 2000, 1, 512] - - [940, 159.106] + - [1002, 159.106] - - [256, 2000, 1, 512] - - [983, 6527.59] + - [1045, 6527.59] - - [4, 1024, 1, 100] - - [932, 70.237] + - [994, 70.237] - - [512, 1024, 1, 2000] - - [1017, 8295.8] + - [1079, 8295.8] - - [100, 500, 1, 500] - - [889, 2016.23] + - [951, 2016.23] - - [4, 2048, 1, 1024] - - [937, 285.039] + - [999, 285.039] - - [2, 1024, 1, 500] - - [886, 109.502] + - [948, 109.502] - - [64, 100, 1, 500] - - [886, 296.396] + - [948, 296.396] - - [256, 2000, 1, 2000] - - [994, 8152.97] + - [1056, 8152.97] - - [2, 512, 1, 500] - - [892, 44.8552] + - [954, 44.8552] - - [8, 2048, 1, 500] - - [886, 736.791] + - [948, 736.791] - - [10, 1024, 1, 500] - - [886, 547.109] + - [948, 547.109] - - [4, 2048, 1, 2000] - - [896, 604.23] + - [958, 604.23] - - [200, 1024, 1, 2000] - - [1063, 5400.94] + - [1125, 5400.94] - - [128, 500, 1, 512] - - [1056, 2730.77] + - [1118, 2730.77] - - [10, 500, 1, 2048] - - [950, 359.651] + - [1012, 359.651] - - [256, 2048, 1, 2000] - - [994, 8375.31] + - [1056, 8375.31] - - [8, 2000, 1, 2000] - - [896, 1146.23] + - [958, 1146.23] - - [100, 2048, 1, 512] - - [1019, 3936.2] + - [1081, 3936.2] - - [512, 500, 1, 2048] - - [995, 6756.39] + - [1057, 6756.39] - - [200, 2048, 1, 100] - - [986, 3180.22] + - [1048, 3180.22] - - [128, 512, 1, 512] - - [889, 2872.91] + - [951, 2872.91] - - [200, 2000, 1, 2048] - - [1035, 4818.92] + - [1097, 4818.92] - - [4, 2000, 1, 1024] - - [933, 275.369] + - [995, 275.369] - - [64, 512, 1, 10] - - [1048, 69.5237] + - [1110, 69.5237] - - [32, 500, 1, 2000] - - [915, 1246.21] + - [977, 1246.21] - - [128, 2048, 1, 2000] - - [1028, 7233.65] + - [1090, 7233.65] - - [100, 100, 1, 2048] - - [886, 790.223] + - [948, 790.223] - - [500, 2048, 1, 512] - - [1016, 7249.66] + - [1078, 7249.66] - - [200, 100, 1, 512] - - [892, 748.638] + - [954, 748.638] - - [32, 2000, 1, 100] - - [887, 930.333] + - [949, 930.333] - - [500, 512, 1, 2048] - - [1038, 6640.02] + - [1100, 6640.02] - - [500, 2000, 1, 500] - - [1018, 7078.24] + - [1080, 7078.24] - - [200, 100, 1, 2048] - - [896, 1387.63] + - [958, 1387.63] - - [2, 2048, 1, 100] - - [930, 64.9101] + - [992, 64.9101] - - [8, 100, 1, 10] - - [871, 1.85439] + - [933, 1.85439] - - [200, 2048, 1, 2048] - - [1035, 5022.02] + - [1097, 5022.02] - - [200, 2048, 1, 500] - - [986, 5355.75] + - [1048, 5355.75] - - [100, 100, 1, 500] - - [1056, 416.767] + - [1118, 416.767] - - [8, 2048, 1, 10] - - [927, 34.8119] + - [989, 34.8119] - - [100, 500, 1, 10] - - [867, 93.3836] + - [929, 93.3836] - - [200, 500, 1, 2000] - - [1059, 4152.92] + - [1121, 4152.92] - - [512, 2000, 1, 500] - - [988, 7485.48] + - [1050, 7485.48] - - [10, 500, 1, 1024] - - [954, 363.736] + - [1016, 363.736] - - [256, 100, 1, 10] - - [1003, 41.1256] + - [1065, 41.1256] - - [500, 512, 1, 1024] - - [982, 6362.82] + - [1044, 6362.82] - - [200, 2048, 1, 2000] - - [1024, 6321.09] + - [1086, 6321.09] - - [100, 1024, 1, 100] - - [1023, 1306.22] + - [1085, 1306.22] - - [500, 1024, 1, 100] - - [962, 3699.52] + - [1024, 3699.52] - - [10, 512, 1, 2048] - - [886, 361.18] + - [948, 361.18] - - [2, 1024, 1, 512] - - [935, 105.803] + - [997, 105.803] - - [4, 500, 1, 2048] - - [958, 143.517] + - [1020, 143.517] - - [100, 512, 1, 100] - - [891, 744.286] + - [953, 744.286] - - [16, 500, 1, 512] - - [886, 453.197] + - [948, 453.197] - - [10, 1024, 1, 100] - - [884, 166.334] + - [946, 166.334] - - [8, 1024, 1, 100] - - [932, 140.374] + - [994, 140.374] - - [64, 2000, 1, 500] - - [1027, 3940.99] + - [1089, 3940.99] - - [64, 1024, 1, 2000] - - [892, 3531.13] + - [954, 3531.13] - - [10, 100, 1, 512] - - [886, 61.6385] + - [948, 61.6385] - - [4, 500, 1, 2000] - - [912, 173.11] + - [974, 173.11] - - [512, 1024, 1, 10] - - [963, 736.46] + - [1025, 736.46] - - [128, 2048, 1, 2048] - - [1026, 4596.6] + - [1088, 4596.6] - - [4, 100, 1, 100] - - [875, 7.24286] + - [937, 7.24286] - - [32, 1024, 1, 512] - - [935, 1519.78] + - [997, 1519.78] - - [8, 512, 1, 2000] - - [960, 356.894] + - [1022, 356.894] - - [100, 100, 1, 512] - - [900, 426.767] + - [962, 426.767] - - [2, 2048, 1, 2048] - - [909, 170.878] + - [971, 170.878] - - [2, 512, 1, 2000] - - [912, 90.8801] + - [974, 90.8801] - - [16, 500, 1, 10] - - [885, 18.2818] + - [947, 18.2818] - - [10, 500, 1, 100] - - [885, 88.1282] + - [947, 88.1282] - - [4, 100, 1, 500] - - [950, 23.6849] + - [1012, 23.6849] - - [512, 1024, 1, 1024] - - [1002, 7431.87] + - [1064, 7431.87] - - [64, 500, 1, 100] - - [895, 506.429] + - [957, 506.429] - - [128, 2000, 1, 10] - - [1053, 432.532] + - [1115, 432.532] - - [10, 2000, 1, 2048] - - [916, 806.399] + - [978, 806.399] - - [2, 100, 1, 100] - - [873, 3.225] + - [935, 3.225] - - [10, 512, 1, 2000] - - [905, 462.194] + - [967, 462.194] - - [8, 500, 1, 500] - - [886, 231.581] + - [948, 231.581] - - [4, 500, 1, 512] - - [886, 118.619] + - [948, 118.619] - - [10, 500, 1, 10] - - [880, 11.0649] + - [942, 11.0649] - - [64, 512, 1, 2000] - - [886, 2116.9] + - [948, 2116.9] - - [500, 512, 1, 10] - - [1007, 395.162] + - [1069, 395.162] - - [200, 512, 1, 512] - - [1025, 3449.36] + - [1087, 3449.36] - - [512, 500, 1, 500] - - [981, 5536.43] + - [1043, 5536.43] - - [32, 512, 1, 2000] - - [896, 1264.3] + - [958, 1264.3] - - [128, 500, 1, 2048] - - [952, 3006.34] + - [1014, 3006.34] - - [500, 2048, 1, 10] - - [977, 1049.28] + - [1039, 1049.28] - - [512, 512, 1, 100] - - [1014, 2664.16] + - [1076, 2664.16] - - [200, 2000, 1, 512] - - [1022, 5192.8] + - [1084, 5192.8] - - [500, 500, 1, 512] - - [978, 5673.86] + - [1040, 5673.86] - - [128, 2048, 1, 500] - - [1010, 5251.38] + - [1072, 5251.38] - - [4, 512, 1, 512] - - [886, 123.753] + - [948, 123.753] - - [16, 2048, 1, 2000] - - [902, 2294.78] + - [964, 2294.78] - - [16, 500, 1, 1024] - - [886, 562.737] + - [948, 562.737] - - [256, 2000, 1, 500] - - [1016, 6639.1] + - [1078, 6639.1] - - [10, 1024, 1, 10] - - [866, 21.0836] + - [928, 21.0836] - - [16, 500, 1, 500] - - [886, 446.529] + - [948, 446.529] - - [10, 2048, 1, 512] - - [884, 784.962] + - [946, 784.962] - - [200, 500, 1, 10] - - [859, 176.156] + - [921, 176.156] - - [256, 2048, 1, 512] - - [1013, 6540.93] + - [1075, 6540.93] - - [256, 2000, 1, 2048] - - [990, 6670.43] + - [1052, 6670.43] - - [500, 2048, 1, 500] - - [1018, 7264.57] + - [1080, 7264.57] - - [500, 100, 1, 1024] - - [1040, 2700.52] + - [1102, 2700.52] - - [16, 100, 1, 512] - - [950, 96.7038] + - [1012, 96.7038] - - [64, 512, 1, 2048] - - [951, 1868.39] + - [1013, 1868.39] - - [32, 1024, 1, 10] - - [862, 69.5237] + - [924, 69.5237] - - [16, 2048, 1, 512] - - [935, 1226.5] + - [997, 1226.5] - - [8, 1024, 1, 512] - - [935, 416.202] + - [997, 416.202] - - [4, 1024, 1, 2048] - - [957, 223.201] + - [1019, 223.201] - - [100, 2048, 1, 2000] - - [1030, 5614.14] + - [1092, 5614.14] - - [512, 512, 1, 2048] - - [995, 6868.97] + - [1057, 6868.97] - - [256, 2000, 1, 1024] - - [986, 5758.98] + - [1048, 5758.98] - - [64, 512, 1, 512] - - [1055, 1651.4] + - [1117, 1651.4] - - [200, 1024, 1, 10] - - [869, 341.433] + - [931, 341.433] - - [128, 500, 1, 500] - - [898, 2580.75] + - [960, 2580.75] - - [100, 512, 1, 1024] - - [889, 2041.72] + - [951, 2041.72] - - [16, 1024, 1, 500] - - [886, 867.897] + - [948, 867.897] - - [128, 100, 1, 2048] - - [1056, 1011.46] + - [1118, 1011.46] - - [100, 512, 1, 500] - - [889, 2051.38] + - [951, 2051.38] - - [8, 1024, 1, 1024] - - [902, 424.625] + - [964, 424.625] - - [2, 2000, 1, 10] - - [928, 8.57458] + - [990, 8.57458] - - [4, 500, 1, 10] - - [925, 4.56429] + - [987, 4.56429] - - [500, 2000, 1, 2048] - - [1002, 7444.12] + - [1064, 7444.12] - - [4, 2000, 1, 100] - - [938, 128.305] + - [1000, 128.305] - - [512, 2000, 1, 2000] - - [988, 8454.53] + - [1050, 8454.53] - - [128, 500, 1, 10] - - [1047, 117.747] + - [1109, 117.747] - - [32, 1024, 1, 100] - - [895, 512.1] + - [957, 512.1] - - [8, 500, 1, 2048] - - [910, 286.935] + - [972, 286.935] - - [16, 1024, 1, 1024] - - [874, 881.256] + - [936, 881.256] - - [200, 100, 1, 10] - - [1046, 40.4226] + - [1108, 40.4226] - - [512, 100, 1, 500] - - [1040, 1987.68] + - [1102, 1987.68] - - [512, 2048, 1, 2048] - - [997, 8063.65] + - [1059, 8063.65] - - [16, 2000, 1, 512] - - [896, 1204.81] + - [958, 1204.81] - - [64, 2048, 1, 1024] - - [894, 2853.37] + - [956, 2853.37] - - [32, 2048, 1, 10] - - [868, 130.132] + - [930, 130.132] - - [10, 2048, 1, 10] - - [870, 39.4846] + - [932, 39.4846] - - [4, 2000, 1, 512] - - [886, 316.149] + - [948, 316.149] - - [4, 500, 1, 100] - - [885, 35.8143] + - [947, 35.8143] - - [8, 100, 1, 2048] - - [905, 84.7281] + - [967, 84.7281] - - [512, 2048, 1, 10] - - [985, 1225.07] + - [1047, 1225.07] - - [512, 100, 1, 10] - - [974, 90.2408] + - [1036, 90.2408] - - [4, 512, 1, 1024] - - [886, 143.348] + - [948, 143.348] - - [16, 2048, 1, 10] - - [919, 65.1159] + - [981, 65.1159] - - [500, 2000, 1, 100] - - [970, 4717.08] + - [1032, 4717.08] - - [32, 1024, 1, 2048] - - [913, 1582.86] + - [975, 1582.86] - - [100, 2000, 1, 2000] - - [1030, 5512.78] + - [1092, 5512.78] - - [128, 100, 1, 512] - - [1056, 561.196] + - [1118, 561.196] - - [500, 500, 1, 100] - - [1010, 2460.73] + - [1072, 2460.73] - - [32, 2000, 1, 10] - - [862, 119.503] + - [924, 119.503] - - [128, 2048, 1, 100] - - [1010, 2708.2] + - [1072, 2708.2] - - [10, 2000, 1, 100] - - [885, 316.556] + - [947, 316.556] - - [2, 2048, 1, 500] - - [896, 191.145] + - [958, 191.145] - - [32, 1024, 1, 500] - - [896, 1563.46] + - [958, 1563.46] - - [4, 1024, 1, 10] - - [925, 9.24286] + - [987, 9.24286] - - [100, 512, 1, 10] - - [1051, 97.0697] + - [1113, 97.0697] - - [8, 100, 1, 100] - - [901, 14.3857] + - [963, 14.3857] - - [128, 512, 1, 500] - - [889, 2677.22] + - [951, 2677.22] - - [16, 100, 1, 2048] - - [912, 161.997] + - [974, 161.997] - - [2, 1024, 1, 10] - - [925, 4.59123] + - [987, 4.59123] - - [4, 100, 1, 2048] - - [905, 41.8959] + - [967, 41.8959] - - [4, 512, 1, 2000] - - [905, 180.382] + - [967, 180.382] - - [4096, 64, 1, 2048] - - [1105, 7247.28] + - [1167, 7247.28] - - [1024, 10080, 1, 1024] - - [1093, 9833.47] + - [1155, 9833.47] - - [1024, 1131, 1, 1024] - - [1071, 7551.95] + - [1133, 7551.95] - - [36548, 1216, 1, 1024] - - [1083, 10351.6] + - [1145, 10351.6] - - [1024, 29, 1, 1024] - - [1115, 1697.01] + - [1177, 1697.01] - - [1024, 2592, 1, 1024] - - [1084, 8424.11] + - [1146, 8424.11] - - [1024, 1568, 1, 1024] - - [1095, 7511.86] + - [1157, 7511.86] - - [4096, 91, 1, 2048] - - [1064, 5599.91] + - [1126, 5599.91] - - [1024, 4445, 1, 1024] - - [1082, 9261.22] + - [1144, 9261.22] - - [1024, 6272, 1, 1024] - - [1077, 9439.61] + - [1139, 9439.61] - - [36548, 3584, 1, 1024] - - [1076, 10393.8] + - [1138, 10393.8] - - [1024, 1827, 1, 1024] - - [1095, 8714.42] + - [1157, 8714.42] - - [1024, 3220, 1, 1024] - - [1075, 8861.2] + - [1137, 8861.2] - - [1024, 1856, 1, 1024] - - [1092, 8827.05] + - [1154, 8827.05] - - [1024, 1760, 1, 1024] - - [1092, 8334.2] + - [1154, 8334.2] - - [1024, 1600, 1, 1024] - - [1092, 7615.07] + - [1154, 7615.07] - - [1024, 1, 1, 21] - - [1096, 0.1] + - [1158, 0.1] - - [36548, 4235, 1, 1024] - - [1076, 10276.8] + - [1138, 10276.8] - - [1024, 49, 1, 1024] - - [1111, 2643.12] + - [1173, 2643.12] - - [1024, 1984, 1, 1024] - - [1095, 9449.52] + - [1157, 9449.52] - - [1024, 14720, 1, 1024] - - [1082, 10033.3] + - [1144, 10033.3] - - [1024, 1152, 1, 1024] - - [1065, 7523.54] + - [1127, 7523.54] - - [36548, 14976, 1, 1024] - - [1083, 10421.7] + - [1145, 10421.7] - - [36548, 1152, 1, 1024] - - [1083, 10258.1] + - [1145, 10258.1] - - [4096, 86, 1, 3072] - - [1064, 5308.85] + - [1126, 5308.85] - - [1024, 3392, 1, 1024] - - [1077, 9176.54] + - [1139, 9176.54] - - [1024, 1408, 1, 1024] - - [1077, 8958.83] + - [1139, 8958.83] - - [1024, 2080, 1, 1024] - - [1068, 8396.49] + - [1130, 8396.49] - - [1024, 1824, 1, 1024] - - [1086, 8671.71] + - [1148, 8671.71] - - [36548, 2432, 1, 1024] - - [1076, 10392.6] + - [1138, 10392.6] - - [4096, 29, 1, 2048] - - [1097, 4325.66] + - [1159, 4325.66] - - [1024, 1102, 1, 1024] - - [1071, 7204.18] + - [1133, 7204.18] - - [4096, 49, 1, 2048] - - [1103, 5609.29] + - [1165, 5609.29] - - [36548, 1827, 1, 1024] - - [1083, 10183.2] + - [1145, 10183.2] - - [4096, 25, 1, 2048] - - [1098, 3788.31] + - [1160, 3788.31] - - [1024, 10176, 1, 1024] - - [1093, 9941.18] + - [1155, 9941.18] - - [1024, 774, 1, 1024] - - [1078, 7079.67] + - [1140, 7079.67] - - [1024, 1952, 1, 1024] - - [1095, 9300.49] + - [1157, 9300.49] - - [4096, 128, 1, 2048] - - [1065, 8274.96] + - [1127, 8274.96] - - [1024, 17024, 1, 1024] - - [1075, 9960.72] + - [1137, 9960.72] - - [1024, 1472, 1, 1024] - - [1084, 9343.37] + - [1146, 9343.37] - - [36548, 4459, 1, 1024] - - [1076, 10358.1] + - [1138, 10358.1] - - [4096, 91, 1, 3072] - - [1070, 5509.39] + - [1132, 5509.39] - - [1024, 3712, 1, 1024] - - [1084, 9048.66] + - [1146, 9048.66] - - [4096, 64, 1, 3072] - - [1117, 7489.93] + - [1179, 7489.93] - - [4096, 29, 1, 3072] - - [1097, 4511.78] + - [1159, 4511.78] - - [4096, 128, 1, 3072] - - [1064, 8423.83] + - [1126, 8423.83] - - [36548, 12928, 1, 1024] - - [1083, 10426.1] + - [1145, 10426.1] - - [1024, 1632, 1, 1024] - - [1065, 7761.73] + - [1127, 7761.73] - - [1024, 1696, 1, 1024] - - [1090, 8107.29] + - [1152, 8107.29] - - [4096, 24, 1, 2048] - - [1097, 3663.25] + - [1159, 3663.25] - - [4096, 63, 1, 3072] - - [1106, 7175.37] + - [1168, 7175.37] - - [4096, 96, 1, 2048] - - [1065, 5866.28] + - [1127, 5866.28] - - [36548, 1764, 1, 1024] - - [1076, 10128.5] + - [1138, 10128.5] - - [4096, 32, 1, 2048] - - [1101, 4540.62] + - [1163, 4540.62] - - [1024, 35, 1, 1024] - - [1109, 1911.57] + - [1171, 1911.57] - - [1024, 1120, 1, 1024] - - [1064, 7289.13] + - [1126, 7289.13] - - [4096, 49, 1, 3072] - - [1103, 5751.62] + - [1165, 5751.62] - - [1024, 24, 1, 1024] - - [1109, 1392.02] + - [1171, 1392.02] - - [1024, 2944, 1, 1024] - - [1085, 9284.93] + - [1147, 9284.93] - - [36548, 14080, 1, 1024] - - [1076, 10441.4] + - [1138, 10441.4] - - [1024, 1, 1, 1024] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 1280, 1, 1024] - - [1064, 8244.46] + - [1126, 8244.46] - - [1024, 13440, 1, 1024] - - [1076, 9799.92] + - [1138, 9799.92] - - [1024, 1015, 1, 1024] - - [1084, 9187.85] + - [1146, 9187.85] - - [36548, 9120, 1, 1024] - - [1076, 10400.0] + - [1138, 10400.0] - - [36548, 1, 1, 1024] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 3008, 1, 1024] - - [1085, 9468.55] + - [1147, 9468.55] - - [1024, 2560, 1, 1024] - - [1082, 8879.31] + - [1144, 8879.31] - - [1024, 21, 1, 1024] - - [1108, 1234.41] + - [1170, 1234.41] - - [1024, 2208, 1, 1024] - - [1064, 8231.27] + - [1126, 8231.27] - - [1024, 96, 1, 1024] - - [1114, 3767.44] + - [1176, 3767.44] - - [4096, 86, 1, 2048] - - [1065, 5529.09] + - [1127, 5529.09] - - [4096, 96, 1, 3072] - - [1064, 6273.28] + - [1126, 6273.28] - - [1024, 1920, 1, 1024] - - [1094, 9118.19] + - [1156, 9118.19] - - [4096, 27, 1, 2048] - - [1097, 4073.7] + - [1159, 4073.7] - - [36548, 2496, 1, 1024] - - [1076, 10361.2] + - [1138, 10361.2] - - [1024, 1, 1, 14] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 91, 1, 1024] - - [1116, 3647.67] + - [1178, 3647.67] - - [1024, 2016, 1, 1024] - - [1092, 9560.24] + - [1154, 9560.24] - - [1024, 1184, 1, 1024] - - [1065, 7678.96] + - [1127, 7678.96] - - [4096, 1, 1, 2048] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 1664, 1, 1024] - - [1090, 7934.07] + - [1152, 7934.07] - - [1024, 11424, 1, 1024] - - [1082, 9777.91] + - [1144, 9777.91] - - [4096, 24, 1, 3072] - - [1100, 3813.1] + - [1162, 3813.1] - - [1024, 1216, 1, 1024] - - [1064, 7902.13] + - [1126, 7902.13] - - [36548, 3185, 1, 1024] - - [1076, 10336.7] + - [1138, 10336.7] - - [36548, 9216, 1, 1024] - - [1076, 10414.3] + - [1138, 10414.3] - - [1024, 3200, 1, 1024] - - [1082, 8847.01] + - [1144, 8847.01] - - [1024, 2656, 1, 1024] - - [1077, 8649.25] + - [1139, 8649.25] - - [1024, 2368, 1, 1024] - - [1077, 8873.16] + - [1139, 8873.16] - - [1024, 4459, 1, 1024] - - [1084, 9431.32] + - [1146, 9431.32] - - [1024, 3808, 1, 1024] - - [1084, 9263.72] + - [1146, 9263.72] - - [1024, 2336, 1, 1024] - - [1077, 8966.0] + - [1139, 8966.0] - - [4096, 27, 1, 3072] - - [1097, 4171.74] + - [1159, 4171.74] - - [1024, 2304, 1, 1024] - - [1074, 8601.38] + - [1136, 8601.38] - - [1024, 1560, 1, 1024] - - [1089, 7481.74] + - [1151, 7481.74] - - [4096, 35, 1, 3072] - - [1103, 4176.9] + - [1165, 4176.9] - - [1024, 2496, 1, 1024] - - [1080, 9092.86] + - [1142, 9092.86] - - [1024, 1504, 1, 1024] - - [1080, 9220.53] + - [1142, 9220.53] - - [4096, 50, 1, 2048] - - [1104, 5472.83] + - [1166, 5472.83] - - [1024, 3232, 1, 1024] - - [1077, 8961.94] + - [1139, 8961.94] - - [1024, 14, 1, 1024] - - [1108, 882.315] + - [1170, 882.315] - - [36548, 1015, 1, 1024] - - [1076, 10140.9] + - [1138, 10140.9] - - [1024, 2000, 1, 1024] - - [1088, 9487.8] + - [1150, 9487.8] - - [36548, 243, 1, 1024] - - [1081, 9441.12] + - [1143, 9441.12] - - [36548, 32, 1, 1024] - - [1069, 4721.05] + - [1131, 4721.05] - - [1024, 25, 1, 1024] - - [1115, 1462.96] + - [1177, 1462.96] - - [1024, 13184, 1, 1024] - - [1079, 9866.28] + - [1141, 9866.28] - - [1024, 2688, 1, 1024] - - [1074, 8559.93] + - [1136, 8559.93] - - [1024, 27, 1, 1024] - - [1113, 1559.11] + - [1175, 1559.11] - - [36548, 950, 1, 1024] - - [1083, 10053.6] + - [1145, 10053.6] - - [1024, 1764, 1, 1024] - - [1090, 8347.11] + - [1152, 8347.11] - - [1024, 992, 1, 1024] - - [1077, 9035.82] + - [1139, 9035.82] - - [1024, 1376, 1, 1024] - - [1077, 8797.96] + - [1139, 8797.96] - - [1024, 950, 1, 1024] - - [1084, 8635.26] + - [1146, 8635.26] - - [36548, 774, 1, 1024] - - [1076, 9460.82] + - [1138, 9460.82] - - [36548, 25, 1, 1024] - - [1069, 3694.16] + - [1131, 3694.16] - - [1024, 4256, 1, 1024] - - [1077, 9172.16] + - [1139, 9172.16] - - [4096, 32, 1, 3072] - - [1098, 4886.67] + - [1160, 4886.67] - - [1024, 243, 1, 1024] - - [1102, 6594.41] + - [1164, 6594.41] - - [36548, 3712, 1, 1024] - - [1076, 10401.6] + - [1138, 10401.6] - - [1024, 50, 1, 1024] - - [1111, 2742.19] + - [1173, 2742.19] - - [1024, 3360, 1, 1024] - - [1073, 9017.37] + - [1135, 9017.37] - - [1024, 2048, 1, 1024] - - [1088, 9736.65] + - [1150, 9736.65] - - [1024, 2784, 1, 1024] - - [1084, 8835.6] + - [1146, 8835.6] - - [1024, 4992, 1, 1024] - - [1082, 9639.38] + - [1144, 9639.38] - - [36548, 1102, 1, 1024] - - [1083, 9859.04] + - [1145, 9859.04] - - [1024, 1536, 1, 1024] - - [1075, 9294.98] + - [1137, 9294.98] - - [1024, 2720, 1, 1024] - - [1080, 8617.88] + - [1142, 8617.88] - - [4096, 1, 1, 3072] - - [1096, 0.1] + - [1158, 0.1] - - [1024, 2752, 1, 1024] - - [1084, 8902.17] + - [1146, 8902.17] - - [1024, 2816, 1, 1024] - - [1082, 8906.95] + - [1144, 8906.95] - - [1024, 2624, 1, 1024] - - [1084, 8494.41] + - [1146, 8494.41] - - [1024, 2144, 1, 1024] - - [1067, 8243.56] + - [1129, 8243.56] - - [36548, 1131, 1, 1024] - - [1083, 10104.6] + - [1145, 10104.6] - - [4096, 25, 1, 3072] - - [1098, 3959.98] + - [1160, 3959.98] - - [1024, 64, 1, 1024] - - [1111, 3410.1] + - [1173, 3410.1] - - [1024, 3296, 1, 1024] - - [1082, 9066.52] + - [1144, 9066.52] - - [36548, 4992, 1, 1024] - - [1076, 10395.6] + - [1138, 10395.6] - - [1024, 1344, 1, 1024] - - [1077, 8522.66] + - [1139, 8522.66] - - [36548, 2401, 1, 1024] - - [1076, 10250.3] + - [1138, 10250.3] - - [1024, 15744, 1, 1024] - - [1076, 10006.4] + - [1138, 10006.4] - - [1024, 15232, 1, 1024] - - [1075, 9912.21] + - [1137, 9912.21] - - [1024, 1888, 1, 1024] - - [1087, 8962.98] + - [1149, 8962.98] - - [1024, 1792, 1, 1024] - - [1091, 8556.82] + - [1153, 8556.82] - - [36548, 1073, 1, 1024] - - [1076, 10161.2] + - [1138, 10161.2] - - [4096, 50, 1, 3072] - - [1103, 5882.16] + - [1165, 5882.16] - - [36548, 15488, 1, 1024] - - [1083, 10437.1] + - [1145, 10437.1] - - [1024, 2464, 1, 1024] - - [1080, 8880.02] + - [1142, 8880.02] - - [1024, 2272, 1, 1024] - - [1077, 8720.35] + - [1139, 8720.35] - - [1024, 13, 1, 1024] - - [1107, 774.616] + - [1169, 774.616] - - [1024, 2432, 1, 1024] - - [1082, 8491.53] + - [1144, 8491.53] - - [36548, 24, 1, 1024] - - [1069, 3564.41] + - [1131, 3564.41] - - [1024, 3936, 1, 1024] - - [1092, 9433.3] + - [1154, 9433.3] - - [36548, 13824, 1, 1024] - - [1076, 10439.8] + - [1138, 10439.8] - - [1024, 2401, 1, 1024] - - [1084, 8870.03] + - [1146, 8870.03] - - [1024, 32, 1, 1024] - - [1099, 1839.71] + - [1161, 1839.71] - - [1024, 2176, 1, 1024] - - [1068, 8544.55] + - [1130, 8544.55] - - [1024, 2240, 1, 1024] - - [1077, 8381.55] + - [1139, 8381.55] - - [1024, 1728, 1, 1024] - - [1065, 8212.33] + - [1127, 8212.33] - - [1024, 128, 1, 1024] - - [1112, 4660.44] + - [1174, 4660.44] - - [1024, 216, 1, 1024] - - [1102, 5777.97] + - [1164, 5777.97] - - [1024, 63, 1, 1024] - - [1110, 3329.75] + - [1172, 3329.75] - - [1024, 86, 1, 1024] - - [1116, 3533.7] + - [1178, 3533.7] - - [1024, 2528, 1, 1024] - - [1072, 8789.25] + - [1134, 8789.25] - - [1024, 2400, 1, 1024] - - [1077, 8939.4] + - [1139, 8939.4] - - [1024, 1440, 1, 1024] - - [1084, 9131.41] + - [1146, 9131.41] - - [1024, 2912, 1, 1024] - - [1077, 9140.03] + - [1139, 9140.03] - - [4096, 35, 1, 2048] - - [1103, 4059.85] + - [1165, 4059.85] - - [4096, 63, 1, 2048] - - [1105, 6946.5] + - [1167, 6946.5] - - [1024, 2880, 1, 1024] - - [1075, 9104.98] + - [1137, 9104.98] - - [1024, 4064, 1, 1024] - - [1094, 9715.2] + - [1156, 9715.2] - - [1024, 4655, 1, 1024] - - [1082, 9033.9] + - [1144, 9033.9] - - [1024, 1088, 1, 1024] - - [1066, 8144.41] + - [1128, 8144.41] - - [36548, 6272, 1, 1024] - - [1083, 10427.4] + - [1145, 10427.4] - - [1024, 1, 1, 13] - - [1096, 0.1] + - [1158, 0.1] + - - [768, 512, 1, 768] + - [1182, 5889.14] + - - [768, 2048, 1, 3072] + - [1192, 9394.72] + - - [768, 32, 1, 768] + - [1204, 1502.84] + - - [64, 128, 96, 128] + - [1199, 4973.58] + - - [3072, 1024, 1, 768] + - [1193, 9856.17] + - - [768, 1024, 1, 3072] + - [1186, 8611.16] + - - [768, 512, 1, 3072] + - [1185, 6430.89] + - - [768, 64, 1, 768] + - [1206, 2621.54] + - - [768, 4096, 1, 3072] + - [1191, 10030.5] + - - [768, 2048, 1, 2] + - [1184, 381.863] + - - [768, 2048, 1, 768] + - [1189, 9754.3] + - - [768, 320, 1, 30522] + - [1202, 8529.5] + - - [64, 64, 96, 64] + - [1196, 2496.71] + - - [768, 640, 1, 30522] + - [1183, 8253.94] + - - [768, 1280, 1, 30522] + - [1188, 9572.95] + - - [768, 1280, 1, 768] + - [1192, 8714.03] + - - [768, 640, 1, 768] + - [1182, 7293.13] + - - [768, 32, 1, 2] + - [1194, 11.9154] + - - [3072, 2048, 1, 768] + - [1189, 10019.7] + - - [768, 4096, 1, 768] + - [1189, 9927.45] + - - [3072, 4096, 1, 768] + - [1192, 10150.2] + - - [64, 256, 192, 256] + - [1198, 7054.29] + - - [768, 8, 1, 768] + - [1205, 341.039] + - - [64, 128, 384, 128] + - [1197, 6765.11] + - - [768, 1024, 1, 768] + - [1187, 8768.68] + - - [768, 320, 1, 768] + - [1203, 6838.64] + - - [64, 64, 768, 64] + - [1200, 5388.93] + - - [768, 1024, 1, 2] + - [1180, 258.795] + - - [768, 16, 1, 768] + - [1205, 819.3] + - - [64, 256, 96, 256] + - [1198, 5893.74] + - - [3072, 512, 1, 768] + - [1190, 9722.89] + - - [768, 160, 1, 768] + - [1207, 5019.88] + - - [768, 4096, 1, 2] + - [1181, 507.475] + - - [1600, 512, 1, 1024] + - [1211, 7187.05] + - - [1024, 512, 1, 64] + - [1209, 2557.6] + - - [1024, 512, 1, 1] + - [1208, 71.3348] + - - [2048, 512, 1, 1] + - [1210, 90.4945] + - - [1024, 200, 1, 1] + - [1216, 40.1] + - - [32, 200, 1, 1] + - [1212, 1.66863] + - - [560, 200, 1, 1024] + - [1220, 4731.45] + - - [1, 512, 1, 1] + - [1219, 0.230612] + - - [64, 512, 1, 1] + - [1214, 7.68519] + - - [1024, 8192, 1, 256] + - [1229, 9519.09] + - - [1024, 22016, 1, 256] + - [1235, 9881.22] + - - [256, 8976, 1, 4352] + - [1227, 9567.18] + - - [512, 256, 1, 2048] + - [1240, 5917.99] + - - [1024, 19968, 1, 256] + - [1235, 9882.47] + - - [256, 8976, 1, 1536] + - [1225, 8437.45] + - - [256, 8976, 1, 33536] + - [1225, 8441.99] + - - [1024, 1792, 1, 256] + - [1225, 7757.07] + - - [1024, 21504, 1, 256] + - [1235, 9894.0] + - - [512, 215, 1, 2048] + - [1241, 4665.74] + - - [1024, 7168, 1, 256] + - [1229, 9509.45] + - - [256, 8976, 1, 15872] + - [1231, 8914.75] + - - [1024, 19712, 1, 256] + - [1235, 9772.0] + - - [256, 8976, 1, 5632] + - [1231, 8740.13] + - - [1024, 14848, 1, 256] + - [1235, 9756.25] + - - [1024, 28672, 1, 256] + - [1235, 9959.02] + - - [256, 8976, 1, 9728] + - [1238, 8853.14] + - - [1024, 17152, 1, 256] + - [1229, 9737.4] + - - [256, 8976, 1, 11520] + - [1231, 8999.3] + - - [256, 8976, 1, 8192] + - [1221, 7897.42] + - - [1024, 3328, 1, 256] + - [1236, 8593.63] + - - [256, 8976, 1, 7424] + - [1231, 8980.57] + - - [1024, 18944, 1, 256] + - [1235, 9854.95] + - - [1024, 10496, 1, 256] + - [1230, 9454.0] + - - [256, 8976, 1, 5376] + - [1228, 9608.47] + - - [256, 8976, 1, 6144] + - [1225, 7880.23] + - - [1024, 40448, 1, 256] + - [1235, 10016.7] + - - [256, 8976, 1, 22016] + - [1238, 8939.97] + - - [256, 8976, 1, 4864] + - [1226, 9211.53] + - - [256, 8976, 1, 12288] + - [1222, 8065.15] + - - [1024, 9728, 1, 256] + - [1235, 9636.35] + - - [256, 8976, 1, 2048] + - [1223, 7001.43] + - - [1024, 10240, 1, 256] + - [1229, 9620.06] + - - [256, 8976, 1, 2304] + - [1227, 9509.84] + - - [1024, 7936, 1, 256] + - [1235, 9300.77] + - - [768, 256, 1, 2048] + - [1239, 6268.05] + - - [1024, 9984, 1, 256] + - [1235, 9477.38] + - - [1024, 13312, 1, 256] + - [1235, 9758.66] + - - [1024, 16128, 1, 256] + - [1229, 9722.0] + - - [1024, 8960, 1, 256] + - [1230, 9398.35] + - - [1024, 5120, 1, 256] + - [1236, 9315.6] + - - [1024, 11264, 1, 256] + - [1229, 9664.9] + - - [256, 8976, 1, 20480] + - [1237, 8279.97] + - - [1024, 20992, 1, 256] + - [1229, 9878.97] + - - [256, 8976, 1, 9472] + - [1231, 8991.06] + - - [256, 8976, 1, 8448] + - [1231, 8983.62] + - - [256, 8976, 1, 20992] + - [1232, 8942.21] + - - [256, 8976, 1, 10496] + - [1232, 8989.81] + - - [1024, 15104, 1, 256] + - [1230, 9676.11] + - - [1024, 6400, 1, 256] + - [1238, 9145.99] + - - [1024, 4096, 1, 256] + - [1231, 9124.35] + - - [256, 8976, 1, 2560] + - [1225, 8566.21] + - - [256, 8976, 1, 2816] + - [1227, 9496.94] + - - [1024, 7680, 1, 256] + - [1235, 9460.94] + - - [256, 8976, 1, 14336] + - [1232, 8226.9] + - - [256, 8976, 1, 6656] + - [1232, 8771.52] + - - [1024, 3072, 1, 256] + - [1232, 9077.04] + - - [256, 8976, 1, 5888] + - [1228, 9546.4] + - - [1024, 12288, 1, 256] + - [1229, 9690.91] + - - [256, 8976, 1, 26112] + - [1234, 8699.93] + - - [1024, 7424, 1, 256] + - [1236, 9256.94] + - - [256, 8976, 1, 14848] + - [1237, 8885.89] + - - [768, 215, 1, 2048] + - [1239, 5628.69] + - - [1024, 2560, 1, 256] + - [1232, 8820.93] + - - [256, 8976, 1, 19968] + - [1231, 8928.96] + - - [256, 8976, 1, 9984] + - [1231, 8993.22] + - - [1024, 4864, 1, 256] + - [1232, 8974.4] + - - [1024, 33536, 1, 256] + - [1235, 9943.17] + - - [256, 8976, 1, 15104] + - [1232, 8996.73] + - - [1024, 2048, 1, 256] + - [1230, 8462.76] + - - [256, 8976, 1, 8960] + - [1232, 8999.02] + - - [1024, 6144, 1, 256] + - [1237, 9359.77] + - - [1024, 14592, 1, 256] + - [1235, 9667.52] + - - [256, 8976, 1, 19712] + - [1231, 9020.21] + - - [1024, 11520, 1, 256] + - [1230, 9527.8] + - - [1024, 5632, 1, 256] + - [1229, 9297.3] + - - [256, 8976, 1, 11008] + - [1238, 8994.9] + - - [256, 8976, 1, 17152] + - [1232, 9003.9] + - - [256, 8976, 1, 3072] + - [1221, 8262.06] + - - [1024, 3840, 1, 256] + - [1238, 8671.99] + - - [1024, 14336, 1, 256] + - [1235, 9760.38] + - - [1024, 20480, 1, 256] + - [1229, 9887.95] + - - [1024, 23552, 1, 256] + - [1229, 9890.56] + - - [256, 8976, 1, 7168] + - [1224, 8478.44] + - - [1024, 13568, 1, 256] + - [1229, 9654.74] + - - [1024, 4608, 1, 256] + - [1237, 9218.35] + - - [256, 8976, 1, 10240] + - [1222, 8076.26] + - - [1024, 8704, 1, 256] + - [1231, 9475.6] + - - [1024, 11008, 1, 256] + - [1235, 9525.06] + - - [1024, 8448, 1, 256] + - [1229, 9352.26] + - - [256, 8976, 1, 44505] + - [1233, 8430.33] - null diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml index 1eac3ee28..eb99e9a3c 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml @@ -39633,8 +39633,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -39797,8 +39797,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -39961,8 +39961,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40125,8 +40125,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40289,8 +40289,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40453,8 +40453,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40617,8 +40617,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40781,8 +40781,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -40945,8 +40945,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41109,8 +41109,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41273,8 +41273,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41437,8 +41437,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41601,8 +41601,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41765,8 +41765,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -41925,8 +41925,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42089,8 +42089,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42253,8 +42253,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42417,8 +42417,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42581,8 +42581,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42745,8 +42745,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -42909,8 +42909,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43073,8 +43073,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43237,8 +43237,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43401,8 +43401,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43566,8 +43566,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43733,8 +43733,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -43898,8 +43898,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44061,8 +44061,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44226,8 +44226,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44393,8 +44393,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44558,8 +44558,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44721,8 +44721,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -44886,8 +44886,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45053,8 +45053,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45218,8 +45218,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45381,8 +45381,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45546,8 +45546,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45713,8 +45713,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -45878,8 +45878,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46041,8 +46041,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46206,8 +46206,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46371,8 +46371,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46538,8 +46538,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46703,8 +46703,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -46868,8 +46868,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47033,8 +47033,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47198,8 +47198,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47361,8 +47361,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47526,8 +47526,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47693,8 +47693,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -47858,8 +47858,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48021,8 +48021,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48186,8 +48186,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48353,8 +48353,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48518,8 +48518,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48681,8 +48681,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -48848,8 +48848,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49011,8 +49011,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49178,8 +49178,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49341,8 +49341,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49502,8 +49502,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49665,8 +49665,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49826,8 +49826,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -49987,8 +49987,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50146,8 +50146,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50309,8 +50309,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50468,8 +50468,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50631,8 +50631,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50790,8 +50790,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -50953,8 +50953,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51112,8 +51112,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51275,8 +51275,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51434,8 +51434,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51597,8 +51597,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51758,8 +51758,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -51917,8 +51917,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52080,8 +52080,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52239,8 +52239,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52400,8 +52400,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52561,8 +52561,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52728,8 +52728,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -52897,8 +52897,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53064,8 +53064,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53229,8 +53229,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53396,8 +53396,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -53445,24 +53445,24 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -53470,32 +53470,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2304 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53504,9 +53509,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53514,26 +53519,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53543,6 +53556,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53552,6 +53566,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53566,39 +53581,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 341 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id002 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53606,56 +53629,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53663,19 +53687,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -53683,6 +53714,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53692,6 +53724,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53701,6 +53734,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53715,39 +53749,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 342 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 2 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 2 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 2, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -53764,32 +53806,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 5120 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetB: 4096 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53798,9 +53841,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53808,26 +53851,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53837,6 +53888,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53846,6 +53898,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -53860,45 +53913,53 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 343 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id002 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 64 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -53909,36 +53970,33 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 32 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1024 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -53947,9 +54005,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -53957,26 +54015,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -53986,6 +54052,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -53995,6 +54062,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54009,33 +54077,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 344 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id001 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id002 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54048,40 +54124,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 768 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -54095,10 +54172,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 36 - MacroTile1: 48 - MacroTileA: 36 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54106,19 +54183,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54126,6 +54208,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54135,6 +54218,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54144,6 +54228,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54158,33 +54243,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 345 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 - SubGroup0: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 12 + SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [12, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54196,58 +54291,55 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 12 - LSPB: 12 - LVCA: 16 - LVCB: 16 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 3456 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 576 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 36 - MacroTileA: 48 - MacroTileB: 36 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54255,26 +54347,32 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54284,6 +54382,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54293,6 +54392,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54307,33 +54407,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 346 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 3] - ThreadTile0: 6 - ThreadTile1: 3 - ThreadTileA: 6 - ThreadTileB: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id003 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -54346,8 +54456,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -54355,31 +54465,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 24 - LSPB: 24 - LVCA: 8 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 12 - LVPB: 12 - LdcEqualsLdd: false - LdsNumElements: 4608 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -54393,10 +54504,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54404,19 +54515,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 6 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 192 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54424,6 +54540,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54433,6 +54550,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54442,6 +54560,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54456,39 +54575,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 347 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 - SubGroup0: 8 - SubGroup1: 12 - SubGroupA: 8 - SubGroupB: 12 - SuppresssNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id003 + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -54505,47 +54634,48 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 768 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 48 - MacroTile1: 48 - MacroTileA: 48 - MacroTileB: 48 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54553,19 +54683,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 9 - NumGlobalWriteVectorsPerThread: 9 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54573,6 +54710,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54582,6 +54720,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54591,6 +54730,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54605,85 +54745,94 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 348 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 - SubGroup0: 16 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id004 - ThreadTile0: 3 - ThreadTile1: 3 - ThreadTileA: 3 - ThreadTileB: 3 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 8 - LVCA: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 832 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54691,10 +54840,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54702,19 +54851,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54722,6 +54878,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54731,6 +54888,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54740,6 +54898,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54754,48 +54913,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 349 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id009 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -54806,33 +54973,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54840,10 +55008,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -54851,19 +55019,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -54871,6 +55046,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -54880,6 +55056,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -54889,6 +55066,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -54903,85 +55081,94 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 350 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 4 - LVCB: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 LVPA: 8 LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -54990,9 +55177,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55000,19 +55187,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55020,6 +55214,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55029,6 +55224,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55038,6 +55234,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55052,96 +55249,105 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 351 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55149,19 +55355,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55169,6 +55380,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55178,6 +55390,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55187,6 +55400,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55201,46 +55415,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 352 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id006 + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -55249,44 +55473,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 8 - LSCB: 8 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 16 - LVCA: 4 - LVCB: 8 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -55298,19 +55523,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55318,6 +55550,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55327,6 +55560,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55336,6 +55570,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55350,35 +55585,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 353 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55388,9 +55631,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -55398,31 +55641,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 4 + LSPA: 96 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LVPA: 24 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55436,10 +55680,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55447,19 +55691,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55467,6 +55716,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55476,6 +55726,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55485,6 +55736,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55499,35 +55751,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 354 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id006 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55537,9 +55799,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -55552,26 +55814,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 + LSPA: 64 + LSPB: 128 + LVCA: 4 LVCB: 2 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55585,10 +55848,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55596,26 +55859,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55625,6 +55896,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55634,6 +55906,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55648,35 +55921,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 355 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55686,41 +55967,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 16 + LSPA: 64 + LSPB: 128 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -55734,10 +56016,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55745,19 +56027,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -55765,6 +56054,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55774,6 +56064,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55783,6 +56074,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55797,75 +56089,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 356 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3200 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -55882,11 +56183,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -55894,26 +56195,34 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -55923,6 +56232,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -55932,6 +56242,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -55946,35 +56257,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 357 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -55984,41 +56303,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56032,10 +56352,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56043,19 +56363,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56063,6 +56390,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56072,6 +56400,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56081,6 +56410,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56095,79 +56425,88 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 358 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id009 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56180,11 +56519,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56192,19 +56531,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56212,6 +56556,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56221,6 +56566,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56230,6 +56576,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56244,35 +56591,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 359 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56282,41 +56639,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56330,10 +56688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56341,19 +56699,26 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56361,6 +56726,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56370,6 +56736,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56379,6 +56746,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56393,48 +56761,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 360 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id006 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -56446,26 +56822,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 4 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -56478,11 +56855,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56490,19 +56867,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56510,6 +56892,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56519,6 +56902,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56528,6 +56912,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56542,35 +56927,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 361 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id006 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56580,58 +56975,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -56639,19 +57035,24 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56659,6 +57060,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56668,6 +57070,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56677,6 +57080,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -56691,35 +57095,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 362 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56729,8 +57143,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -56744,22 +57158,23 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3328 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -56778,29 +57193,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56808,6 +57232,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56817,6 +57242,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56826,8 +57252,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -56840,35 +57268,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 363 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id010 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -56878,37 +57314,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3584 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -56927,29 +57364,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -56957,6 +57403,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -56966,6 +57413,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -56975,8 +57423,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -56989,35 +57439,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 364 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id008 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57027,37 +57485,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3200 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -57076,29 +57535,38 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57106,6 +57574,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57115,6 +57584,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57124,8 +57594,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57138,35 +57610,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 365 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id011 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57176,41 +57656,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57224,30 +57705,39 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57255,6 +57745,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57264,6 +57755,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57273,8 +57765,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57287,35 +57781,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 366 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id009 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57325,8 +57827,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -57339,27 +57841,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -57373,30 +57876,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57404,6 +57914,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57413,6 +57924,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57422,8 +57934,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57436,35 +57950,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 367 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id005 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id008 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -57474,8 +57998,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -57488,23 +58012,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + InterleaveAlpha: 0 + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 3584 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -57523,29 +58048,36 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57553,6 +58085,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57562,6 +58095,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57571,8 +58105,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57585,48 +58121,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 368 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id007 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id008 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -57634,67 +58180,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 32 - LVPB: 32 - LdcEqualsLdd: false + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -57702,6 +58256,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57711,6 +58266,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57720,8 +58276,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57734,14 +58292,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 369 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -57752,105 +58317,113 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdcEqualsLdd: true + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -57860,6 +58433,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -57869,8 +58443,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -57883,39 +58459,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 370 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -57923,76 +58509,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 32 - LVPB: 32 - LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58000,6 +58596,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58009,6 +58606,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58018,8 +58616,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58032,33 +58632,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 371 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58081,67 +58689,77 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58149,6 +58767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58158,6 +58777,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58167,8 +58787,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58181,39 +58803,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 372 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -58221,7 +58851,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -58230,36 +58860,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -58267,37 +58898,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58307,6 +58948,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58316,8 +58958,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58330,33 +58974,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 373 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58379,74 +59031,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58456,6 +59119,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58465,8 +59129,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58479,33 +59145,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 374 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58519,76 +59193,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58596,6 +59280,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58605,6 +59290,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58614,8 +59300,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58628,33 +59316,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 375 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58668,76 +59364,86 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58745,6 +59451,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58754,6 +59461,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58763,8 +59471,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58777,48 +59487,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 376 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -58826,36 +59544,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -58863,30 +59582,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -58894,6 +59620,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -58903,6 +59630,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -58912,8 +59640,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -58926,33 +59656,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 377 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -58965,77 +59705,85 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3328 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59043,6 +59791,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59052,6 +59801,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59061,8 +59811,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59075,47 +59827,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 378 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -59124,36 +59886,37 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -59161,37 +59924,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59201,6 +59972,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59210,8 +59982,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59224,33 +59998,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 379 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 1 - WorkGroupMappingType: B + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59263,77 +60047,85 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59341,6 +60133,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59350,6 +60143,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59359,8 +60153,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59373,33 +60169,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 380 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -59412,7 +60218,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -59422,67 +60228,75 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -59490,6 +60304,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59499,6 +60314,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59508,8 +60324,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59522,39 +60340,49 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 381 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59562,33 +60390,34 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 @@ -59600,7 +60429,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -59608,37 +60437,47 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59648,6 +60487,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59657,8 +60497,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59671,39 +60513,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 382 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59711,8 +60561,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -59720,74 +60570,85 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59797,6 +60658,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59806,8 +60668,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59820,39 +60684,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 383 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -59860,65 +60732,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -59926,17 +60801,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -59946,6 +60829,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -59955,8 +60839,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -59969,39 +60855,47 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 384 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -60009,45 +60903,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60055,19 +60950,21 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -60075,17 +60972,25 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60095,6 +61000,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60104,8 +61010,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60118,33 +61026,41 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 385 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -60167,13 +61083,14 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -60183,8 +61100,8 @@ LVCB: 4 LVPA: 16 LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 16384 + LdcEqualsLdd: true + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 @@ -60196,14 +61113,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -60211,7 +61128,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -60224,10 +61143,17 @@ NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60235,6 +61161,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60244,6 +61171,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60253,8 +61181,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60267,75 +61197,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 386 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 16 + LSPB: 64 LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -60345,45 +61284,53 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60393,6 +61340,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60402,8 +61350,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60416,85 +61366,96 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 387 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id020 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60502,37 +61463,45 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60542,6 +61511,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60551,8 +61521,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60565,73 +61537,84 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 388 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 4096 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 @@ -60643,7 +61626,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -60651,30 +61634,37 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -60682,6 +61672,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -60691,6 +61682,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -60700,8 +61692,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60714,80 +61708,21597 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 SolutionIndex: 389 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 - LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 390 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 391 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 392 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 393 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 394 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 395 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 396 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 397 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 398 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 399 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 400 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 401 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 402 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 403 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 404 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 405 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 406 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 407 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 408 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 409 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 410 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 411 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 412 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 413 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 8 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 414 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 + SubGroup0: 16 + SubGroup1: 2 + SubGroupA: 16 + SubGroupB: 2 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 2, 8] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 5120 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 415 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 32 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 416 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3392 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 36 + MacroTile1: 48 + MacroTileA: 36 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 417 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 + SubGroup0: 12 + SubGroup1: 16 + SubGroupA: 12 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [12, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 12 + LSPB: 12 + LVCA: 16 + LVCB: 16 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 3456 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 36 + MacroTileA: 48 + MacroTileB: 36 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 418 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 3] + ThreadTile0: 6 + ThreadTile1: 3 + ThreadTileA: 6 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 24 + LSPB: 24 + LVCA: 8 + LVCB: 8 + LVPA: 12 + LVPB: 12 + LdcEqualsLdd: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 192 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 419 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 + SubGroup0: 8 + SubGroup1: 12 + SubGroupA: 8 + SubGroupB: 12 + SuppresssNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id003 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 768 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 48 + MacroTile1: 48 + MacroTileA: 48 + MacroTileB: 48 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 9 + NumGlobalWriteVectorsPerThread: 9 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 420 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id004 + ThreadTile0: 3 + ThreadTile1: 3 + ThreadTileA: 3 + ThreadTileB: 3 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 421 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 422 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 423 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 424 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 425 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 426 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 427 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 428 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 429 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 430 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 431 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 432 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 433 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id006 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 434 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 435 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id010 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 436 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 437 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id011 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 438 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id009 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 439 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id005 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 440 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id007 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id008 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 441 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 442 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 443 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 444 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 445 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 446 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 447 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 448 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 449 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 450 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 451 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 452 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 453 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 454 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 455 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 456 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 457 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 458 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 459 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 460 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 461 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 462 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 463 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 464 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 465 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 466 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 467 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 468 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 469 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id016 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 470 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 471 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 472 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 473 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 474 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 475 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 476 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 477 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 12 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 478 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 479 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 480 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 481 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 13312 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 482 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 483 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 484 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 485 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 486 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 487 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id015 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id020 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 488 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id019 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id014 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 489 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 490 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 491 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id018 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id013 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 492 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: *id012 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id017 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 493 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 494 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 495 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: [6, 8] + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 496 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 497 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 498 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id023 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 499 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id024 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 500 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 501 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: false + ThreadTile: *id022 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id021 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 502 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 503 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 504 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 505 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 506 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 507 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 508 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 509 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 510 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 511 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 512 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id029 + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 513 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id030 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 514 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id031 + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 515 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 516 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 3072 + LdsNumElementsAlignedB: 3072 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 3072 + LdsOffsetB_Blk: 11264 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 24 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 517 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 518 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 519 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 520 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 521 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id027 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 522 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id028 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 523 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: *id025 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id026 + WorkGroupMapping: 8 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 4 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 4 + LSCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 524 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 2 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 2 + LSCB: 2 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 525 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id032 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 526 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: true + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id035 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 527 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 528 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 529 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 530 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: *id033 + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id034 + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 16 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -60863,14 +83374,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 390 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 531 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: *id015 + ThreadTile: *id033 ThreadTile0: 2 ThreadTile1: 2 ThreadTileA: 2 @@ -60881,13 +83392,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 + WorkGroup: *id035 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -60901,9 +83411,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -60911,33 +83420,325 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 532 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id038 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 533 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -60948,10 +83749,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -60960,14 +83761,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -61012,31 +83811,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 391 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionIndex: 534 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id036 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 + VectorWidth: 1 + WorkGroup: *id037 WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61044,15 +83843,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -61060,37 +83858,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61098,9 +83896,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -61109,14 +83907,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -61161,31 +83957,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 392 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionIndex: 535 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 4 + ThreadTile: *id036 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: *id037 + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61193,53 +83989,52 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61247,10 +84042,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61258,15 +84053,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -61310,31 +84103,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 393 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + SolutionIndex: 536 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id039 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id037 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61342,53 +84135,52 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -61397,9 +84189,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61407,15 +84199,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -61459,31 +84249,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 394 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 537 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: *id037 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61497,43 +84287,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -61556,15 +84345,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -61608,31 +84395,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 395 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 538 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + ThreadTile: *id036 + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + WorkGroup: *id038 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61646,8 +84433,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -61656,11 +84442,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -61674,30 +84460,30 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61705,14 +84491,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -61757,31 +84541,31 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 396 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 + SolutionIndex: 539 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: *id039 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id040 + WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -61795,58 +84579,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -61854,13 +84637,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -61906,33 +84687,34 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 397 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 + SolutionIndex: 540 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id016 + ThreadTile: *id039 ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: *id040 + WorkGroupMapping: 1 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -61945,57 +84727,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62003,15 +84785,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62055,33 +84840,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 398 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 541 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62094,7 +84889,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -62104,47 +84899,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62152,15 +84947,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62204,33 +85002,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 399 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 542 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62243,57 +85051,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62301,15 +85109,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62353,33 +85164,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 400 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 543 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62392,57 +85213,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62450,15 +85271,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62502,33 +85326,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 401 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 544 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -62541,57 +85375,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62599,15 +85433,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62651,46 +85488,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 402 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 545 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -62700,47 +85547,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -62748,15 +85595,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62800,46 +85650,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 403 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 546 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -62849,43 +85709,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -62897,15 +85757,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -62949,46 +85812,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 404 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 547 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -62998,43 +85871,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 64 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -63046,15 +85919,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63098,47 +85974,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 405 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 548 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -63147,47 +86033,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 64 - LVCA: 2 - LVCB: 2 + LVCA: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 12 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63195,15 +86081,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63247,46 +86136,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 406 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 549 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63296,22 +86195,22 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -63325,7 +86224,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -63333,10 +86232,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63344,15 +86243,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63396,48 +86298,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 407 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 550 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -63445,26 +86357,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -63474,7 +86386,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -63482,9 +86394,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -63493,15 +86405,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63545,46 +86460,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 408 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63594,26 +86519,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -63623,18 +86548,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63642,15 +86567,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63694,46 +86622,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 409 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63743,36 +86681,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13312 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -63780,10 +86718,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63793,13 +86731,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63843,46 +86784,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 410 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 553 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -63892,47 +86843,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -63940,15 +86891,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -63992,46 +86946,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 411 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 554 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64041,47 +87005,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -64089,20 +87049,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -64141,46 +87104,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 412 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 555 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64190,47 +87163,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -64240,13 +87213,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64290,46 +87266,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 413 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 556 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64339,26 +87325,26 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -64368,7 +87354,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -64376,10 +87362,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -64387,15 +87373,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64439,96 +87428,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 414 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 557 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id017 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -64536,15 +87535,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64588,46 +87590,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 415 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id015 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 558 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id020 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64637,43 +87649,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -64686,14 +87698,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64737,46 +87752,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 416 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 559 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id019 - ThreadTile0: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id014 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -64786,46 +87811,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -64834,15 +87859,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -64886,48 +87914,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 417 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 560 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -64935,46 +87973,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -64983,15 +88021,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65035,46 +88076,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 418 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 32 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65084,47 +88135,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65132,15 +88183,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65184,46 +88238,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 419 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id018 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id013 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65233,36 +88297,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1600 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -65281,20 +88341,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -65333,35 +88396,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 420 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 563 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id012 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id017 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65372,7 +88445,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65385,11 +88458,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 @@ -65399,14 +88472,10 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -65431,19 +88500,22 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -65482,35 +88554,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 421 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65520,8 +88602,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65531,31 +88613,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -65569,9 +88651,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65579,15 +88661,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65631,35 +88716,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 422 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65669,42 +88764,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -65717,10 +88812,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65728,15 +88823,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65780,35 +88878,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 423 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 566 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [6, 8] - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id021 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65818,10 +88926,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -65832,28 +88940,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -65867,9 +88975,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -65877,15 +88985,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -65929,35 +89040,45 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 424 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 567 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -65967,8 +89088,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -65978,31 +89099,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66016,9 +89137,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66027,14 +89148,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66078,48 +89202,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 425 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -66131,27 +89265,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66163,11 +89297,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66175,15 +89309,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66227,46 +89364,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 426 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id023 - ThreadTile0: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 - WorkGroupMapping: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66280,27 +89427,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66312,11 +89459,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66324,15 +89471,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66376,46 +89526,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 427 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id024 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66425,31 +89585,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66461,7 +89621,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -66475,13 +89635,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66525,46 +89688,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 428 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66578,27 +89751,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66610,7 +89783,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -66624,13 +89797,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66674,46 +89850,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 429 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: *id022 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id021 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66727,27 +89913,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66759,11 +89945,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66771,15 +89957,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -66823,46 +90012,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 430 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -66876,27 +90075,23 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -66908,11 +90103,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -66920,20 +90115,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -66972,33 +90170,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 431 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67011,7 +90219,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67025,8 +90233,8 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -67038,14 +90246,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67058,9 +90266,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -67069,15 +90277,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67121,33 +90332,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 432 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67160,7 +90381,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67174,8 +90395,8 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -67187,14 +90408,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67208,9 +90429,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67218,15 +90439,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67270,33 +90494,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 433 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 576 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67308,9 +90542,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -67318,32 +90552,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2624 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67356,10 +90586,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67367,20 +90597,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -67419,33 +90652,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 434 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 577 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67458,7 +90701,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67468,31 +90711,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67505,10 +90748,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67516,15 +90759,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67568,80 +90814,90 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 435 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 578 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -67653,11 +90909,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67665,15 +90921,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67717,48 +90976,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 436 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 579 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -67769,29 +91038,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3600 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -67802,11 +91071,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67814,15 +91083,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -67866,33 +91138,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 437 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 580 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -67905,7 +91187,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -67915,32 +91197,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -67953,9 +91235,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67964,14 +91246,17 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68015,33 +91300,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 438 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 581 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68054,7 +91349,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68064,12 +91359,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -68081,15 +91376,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 6176 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68102,9 +91397,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68112,15 +91407,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68164,33 +91462,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 439 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 582 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68203,42 +91511,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6176 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68250,10 +91558,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68261,15 +91569,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68313,33 +91624,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 440 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 583 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id029 - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68352,7 +91673,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68362,32 +91683,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68399,10 +91720,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68410,15 +91731,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68462,47 +91786,57 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 441 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 584 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id030 - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -68510,33 +91844,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68547,11 +91881,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68559,15 +91893,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68611,33 +91948,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 442 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id031 - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 585 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id026 + VectorWidth: 4 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -68650,7 +91997,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68664,28 +92011,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68697,10 +92044,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68708,15 +92055,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68760,46 +92110,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 443 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 586 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68813,28 +92173,24 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68845,11 +92201,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68857,20 +92213,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -68909,46 +92268,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 444 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -68958,32 +92327,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 3072 - LdsNumElementsAlignedB: 3072 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 3072 - LdsOffsetB_Blk: 11264 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -68994,27 +92363,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MinGlobalWriteVectorWidth: 1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69058,46 +92430,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 445 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69107,32 +92489,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69143,7 +92525,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -69157,13 +92539,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69207,46 +92592,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 446 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69260,28 +92655,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69292,7 +92687,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -69306,13 +92701,16 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69356,46 +92754,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 447 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69409,28 +92817,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 13376 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69441,11 +92849,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69455,13 +92863,16 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69505,46 +92916,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 448 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 - WorkGroupMapping: 1 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69558,28 +92979,28 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69590,11 +93011,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69602,15 +93023,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69654,48 +93078,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 449 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id027 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -69707,28 +93141,24 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 32 - LVCA: 8 + LVCA: 4 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 2624 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69739,11 +93169,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69751,20 +93181,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -69803,46 +93236,56 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 450 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id028 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 593 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [32, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -69852,32 +93295,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -69888,11 +93331,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69900,15 +93343,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69952,81 +93398,91 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 451 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: *id025 - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id026 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 4 - LSCB: 4 - LSPA: 16 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -70037,11 +93493,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70049,15 +93505,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70101,48 +93560,58 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 452 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id032 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70157,25 +93626,25 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 2 - LSCB: 2 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -70186,10 +93655,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -70200,13 +93669,16 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70250,33 +93722,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 453 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 - SubGroup0: 8 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: *id032 - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -70289,57 +93771,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 12864 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 512 + LdsOffsetB_Blk: 8704 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70347,15 +93829,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70399,96 +93884,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 454 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 597 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: true - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70496,15 +93991,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70548,96 +94046,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 455 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70645,15 +94153,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70697,96 +94208,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 456 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70794,15 +94315,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70846,96 +94370,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 457 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70943,15 +94477,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70995,96 +94532,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 458 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id034 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71092,15 +94639,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71144,94 +94694,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 459 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: *id033 - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: *id035 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: false + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 9216 + LdsPadA: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71239,13 +94801,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71289,13 +94856,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 460 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 603 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -71305,79 +94880,82 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id038 - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 32 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3408 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 LdsOffsetB_Blk: 3136 - LdsPadA: 1 - LdsPadB: 1 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71385,13 +94963,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71435,13 +95018,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 461 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 604 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -71451,79 +95042,82 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71531,13 +95125,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71581,13 +95180,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 462 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -71597,79 +95204,82 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71677,13 +95287,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71727,13 +95342,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 463 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id036 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 606 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -71743,39 +95366,42 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: *id037 - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -71783,39 +95409,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71823,13 +95449,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71873,95 +95504,106 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 464 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 607 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: *id039 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id037 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3456 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71969,13 +95611,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72019,33 +95666,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 465 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 608 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: *id037 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72057,56 +95714,57 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72115,13 +95773,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72165,13 +95828,21 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 466 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - ThreadTile: *id036 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 @@ -72181,17 +95852,19 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id038 + VectorWidth: 2 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72203,42 +95876,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72250,10 +95924,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72261,13 +95935,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72311,33 +95990,43 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 467 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 - SubGroup0: 32 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 610 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - ThreadTile: *id039 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id040 + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -72349,6 +96038,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -72359,12 +96049,12 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 @@ -72376,15 +96066,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72396,9 +96082,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72407,18 +96093,23 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -72457,26 +96148,35 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 468 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - ThreadTile: *id039 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 611 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: *id040 + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -72507,7 +96207,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -72524,15 +96224,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72544,10 +96244,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72555,12 +96255,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -72614,8 +96314,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 469 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 612 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -72623,12 +96323,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -72669,7 +96369,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -72679,22 +96379,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72706,10 +96406,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72717,13 +96417,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -72776,20 +96476,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 470 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 613 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -72797,7 +96497,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -72831,7 +96531,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -72848,15 +96548,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -72868,9 +96568,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72879,11 +96579,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -72938,28 +96638,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 471 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -72993,7 +96693,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73003,22 +96703,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73030,10 +96730,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73041,13 +96741,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -73100,29 +96800,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 472 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73155,7 +96855,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73172,15 +96872,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73262,8 +96962,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 473 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 616 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73271,7 +96971,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -73284,7 +96984,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73317,7 +97017,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73334,15 +97034,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73355,9 +97055,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73365,12 +97065,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73424,8 +97124,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 474 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73433,12 +97133,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 8] + SuppressNoLoadLoop: true + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -73446,7 +97146,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73479,7 +97179,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73496,15 +97196,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73517,9 +97217,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73527,12 +97227,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73586,20 +97286,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 475 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 618 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -73607,7 +97307,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -73632,7 +97332,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -73641,7 +97341,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -73651,22 +97351,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73678,10 +97378,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73694,7 +97394,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -73748,15 +97448,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 476 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 619 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -73769,8 +97469,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73794,7 +97494,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -73813,22 +97513,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -73840,9 +97540,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73851,8 +97551,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -73910,8 +97610,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 477 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 + SolutionIndex: 620 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -73920,10 +97620,10 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -73932,7 +97632,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -73982,15 +97682,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74072,8 +97772,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 478 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 621 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74094,7 +97794,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74110,7 +97810,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -74119,7 +97819,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -74127,32 +97827,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74163,11 +97863,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74175,12 +97875,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -74234,31 +97934,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 479 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 622 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -74292,7 +97992,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74306,15 +98006,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74326,10 +98026,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74337,12 +98037,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -74396,8 +98096,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 480 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74406,19 +98106,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74451,10 +98151,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74468,15 +98168,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74489,9 +98189,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74499,12 +98199,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -74558,8 +98258,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 481 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 624 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74567,20 +98267,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74613,10 +98313,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74630,15 +98330,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74650,9 +98350,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -74661,11 +98361,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -74720,8 +98420,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 482 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 625 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -74729,20 +98429,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74764,7 +98464,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -74775,10 +98475,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74792,11 +98492,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74820,7 +98524,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -74835,7 +98539,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -74878,29 +98582,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 483 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 626 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -74933,10 +98637,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -74950,15 +98654,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -74982,7 +98686,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -75040,8 +98744,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 484 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 627 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75049,7 +98753,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -75060,7 +98764,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 1] WorkGroupMapping: 64 WorkGroupMappingType: B @@ -75095,10 +98799,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -75112,15 +98816,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6240 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75132,10 +98836,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75143,12 +98847,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -75202,8 +98906,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 485 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 628 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75211,18 +98915,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B @@ -75240,7 +98944,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75257,32 +98961,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 9280 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75293,11 +98997,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75305,12 +99009,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -75364,31 +99068,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 486 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 629 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75402,7 +99106,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75427,24 +99131,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 - LSPB: 64 - LVCA: 2 - LVCB: 2 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 512 + LdsNumElements: 14464 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 4160 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 LdsPadA: 2 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75455,11 +99159,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75471,9 +99175,9 @@ NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -75526,31 +99230,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 487 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 + SolutionIndex: 630 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75564,7 +99268,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75581,7 +99285,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -75589,24 +99293,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 LdsPadA: 2 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75617,11 +99321,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75634,8 +99338,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -75688,15 +99392,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 488 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 631 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -75709,10 +99413,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75726,7 +99430,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75743,32 +99447,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 + LVCA: 2 + LVCB: 4 + LVPA: 32 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3424 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75779,7 +99483,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 32 MacroTileA: 128 @@ -75792,10 +99496,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -75850,8 +99554,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 489 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 632 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -75859,7 +99563,7 @@ SubGroup1: 8 SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -75870,11 +99574,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -75888,7 +99592,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -75897,7 +99601,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -75905,32 +99609,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3680 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -75941,10 +99645,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75953,7 +99657,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -76012,31 +99716,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 490 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 633 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -76056,7 +99760,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -76070,7 +99774,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -76084,11 +99788,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1600 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76112,7 +99820,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -76127,7 +99835,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -76170,8 +99878,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 491 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 634 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76179,7 +99887,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -76190,9 +99898,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76214,9 +99922,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -76228,25 +99936,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76258,10 +99970,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76270,11 +99982,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -76285,7 +99997,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -76328,16 +100040,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 492 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 635 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -76348,9 +100060,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76383,32 +100095,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6272 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76421,9 +100133,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76432,12 +100144,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -76490,16 +100202,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 493 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 636 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -76510,9 +100222,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76545,10 +100257,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -76562,15 +100274,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76582,9 +100294,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76593,11 +100305,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -76652,8 +100364,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 494 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 637 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76661,20 +100373,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76710,7 +100422,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -76724,15 +100436,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 + LdsNumElements: 6784 LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2112 LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 0 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76756,7 +100468,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -76814,8 +100526,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 495 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 638 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -76834,9 +100546,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -76869,32 +100581,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -76906,10 +100618,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76917,13 +100629,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -76976,29 +100688,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 496 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 639 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -77014,13 +100726,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 @@ -77039,24 +100751,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4736 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 4160 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77067,11 +100775,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 256 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 256 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77079,11 +100787,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -77095,7 +100803,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -77138,8 +100846,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 497 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77147,12 +100855,12 @@ SubGroup1: 8 SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -77162,7 +100870,7 @@ WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -77176,7 +100884,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -77193,7 +100901,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 3 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -77201,24 +100909,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 128 - LVCA: 2 - LVCB: 2 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77229,11 +100937,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77241,11 +100949,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -77300,8 +101008,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 498 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 641 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77309,12 +101017,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -77322,9 +101030,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -77365,22 +101073,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 3712 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB_Blk: 3136 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77393,9 +101101,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77407,9 +101115,9 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -77462,15 +101170,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 499 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 642 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -77483,7 +101191,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -77534,15 +101242,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77555,9 +101263,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77565,12 +101273,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -77624,20 +101332,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 500 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 643 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -77645,7 +101353,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 32, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -77696,15 +101404,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77716,9 +101424,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77727,11 +101435,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -77786,8 +101494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 501 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 644 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -77796,10 +101504,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -77830,7 +101538,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -77858,11 +101566,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 2112 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -77874,10 +101586,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77889,8 +101601,8 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -77901,7 +101613,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -77944,16 +101656,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 502 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 645 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -77965,7 +101677,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -78016,15 +101728,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 7296 LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2112 LdsOffsetB_Blk: 6208 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78106,28 +101818,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 503 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 646 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -78178,15 +101890,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78198,10 +101910,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78209,12 +101921,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -78268,8 +101980,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 504 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 647 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78278,11 +101990,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -78290,7 +102002,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -78340,11 +102052,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 2688 LdsOffsetA: 0 LdsOffsetB: 2112 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78426,8 +102138,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 505 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 648 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -78448,7 +102160,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -78473,7 +102185,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -78481,7 +102193,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -78491,22 +102203,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78518,9 +102230,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -78535,7 +102247,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78588,16 +102300,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 506 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 649 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -78609,8 +102321,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -78626,7 +102338,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78651,24 +102363,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78679,11 +102391,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78691,12 +102403,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -78750,31 +102462,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 507 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 650 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -78788,7 +102500,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78797,7 +102509,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -78805,32 +102517,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3600 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -78841,11 +102553,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78853,13 +102565,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -78912,31 +102624,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 508 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 651 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -78967,10 +102679,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -78984,15 +102696,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3712 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79016,7 +102728,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -79074,8 +102786,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 509 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -79083,7 +102795,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -79094,9 +102806,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -79120,7 +102832,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -79129,32 +102841,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6176 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79166,10 +102878,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79178,11 +102890,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -79236,16 +102948,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 510 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -79256,9 +102968,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -79291,10 +103003,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -79308,15 +103020,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6176 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79329,9 +103041,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79339,12 +103051,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -79398,28 +103110,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 511 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 654 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 32, 1] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -79453,32 +103165,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -79491,9 +103203,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79502,12 +103214,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79560,16 +103272,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 512 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 655 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -79580,8 +103292,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -79598,7 +103310,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79607,7 +103319,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79623,23 +103335,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 64 - LVCA: 2 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -79651,11 +103363,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79668,7 +103380,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -79722,15 +103434,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 513 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 656 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -79743,10 +103455,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -79787,21 +103499,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -79814,10 +103526,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79825,13 +103537,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -79884,20 +103596,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 514 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 657 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -79905,8 +103617,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -79928,7 +103640,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -79956,10 +103668,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 10304 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -79973,9 +103689,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79983,12 +103699,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -79999,7 +103715,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -80042,8 +103758,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 515 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 658 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80051,12 +103767,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80064,7 +103780,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -80089,7 +103805,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -80097,7 +103813,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -80107,21 +103823,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80134,9 +103850,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80151,7 +103867,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -80204,16 +103920,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 516 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -80225,8 +103941,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -80259,7 +103975,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -80276,14 +103992,14 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80296,9 +104012,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80307,11 +104023,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -80366,29 +104082,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 517 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 660 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -80412,7 +104128,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -80421,7 +104137,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -80431,21 +104147,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetB: 576 + LdsOffsetB_Blk: 4672 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80458,9 +104174,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -80469,8 +104185,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -80528,8 +104244,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 518 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 661 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80537,11 +104253,11 @@ SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -80550,7 +104266,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -80566,7 +104282,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80591,23 +104307,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13376 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 - LdsPadA: 0 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80619,11 +104335,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80631,8 +104347,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -80690,8 +104406,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 519 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 + SolutionIndex: 662 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80700,11 +104416,11 @@ SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80712,9 +104428,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -80728,7 +104444,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80753,23 +104469,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80781,11 +104497,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80793,11 +104509,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -80852,8 +104568,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 520 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 663 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -80862,11 +104578,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -80874,9 +104590,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -80890,16 +104606,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -80907,7 +104623,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 5 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -80915,19 +104631,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2624 + LdsNumElements: 13568 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetB: 2048 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 9344 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -80939,11 +104659,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80955,8 +104675,8 @@ NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -80967,7 +104687,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81010,15 +104730,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 521 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 664 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -81031,10 +104751,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81048,7 +104768,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81065,7 +104785,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -81073,23 +104793,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -81101,11 +104821,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81118,8 +104838,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81172,16 +104892,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 522 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 665 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -81193,10 +104913,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81210,7 +104930,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81227,7 +104947,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -81235,23 +104955,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -81263,11 +104983,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81275,12 +104995,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -81334,8 +105054,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 523 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 666 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81343,12 +105063,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -81358,7 +105078,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81372,7 +105092,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81397,23 +105117,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -81425,11 +105145,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81442,8 +105162,8 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -81496,15 +105216,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 524 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 667 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -81517,10 +105237,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81534,7 +105254,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -81542,7 +105262,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -81559,23 +105279,23 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12864 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 14592 + LdsNumElementsAlignedA: 2176 + LdsNumElementsAlignedB: 4224 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 512 - LdsOffsetB_Blk: 8704 - LdsPadA: 0 + LdsOffsetB: 2176 + LdsOffsetB_Blk: 10368 + LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -81587,11 +105307,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81603,7 +105323,7 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 @@ -81658,15 +105378,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 525 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 + SolutionIndex: 668 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -81679,10 +105399,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -81705,7 +105425,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -81716,7 +105436,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -81724,21 +105444,21 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81751,9 +105471,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81761,12 +105481,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -81793,6 +105513,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81802,6 +105523,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81820,29 +105542,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 526 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 669 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -81878,7 +105600,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -81892,15 +105614,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -81913,9 +105635,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81923,12 +105645,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -81955,6 +105677,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -81964,6 +105687,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -81982,8 +105706,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 527 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 670 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -81992,19 +105716,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -82020,7 +105744,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82029,7 +105753,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82040,44 +105764,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82085,13 +105809,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82117,6 +105841,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82126,6 +105851,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82144,31 +105870,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 528 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 671 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82182,7 +105908,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82199,32 +105925,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 2 + LVCB: 2 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82235,10 +105961,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -82248,12 +105974,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82279,6 +106005,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82288,6 +106015,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82306,16 +106034,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 529 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 672 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -82326,11 +106054,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82344,7 +106072,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82353,7 +106081,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82361,47 +106089,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 - LSPB: 32 - LVCA: 8 + LSPB: 16 + LVCA: 4 LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 2 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82409,13 +106137,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82441,6 +106169,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82450,6 +106179,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82468,31 +106198,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 530 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 673 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 2 + WorkGroup: [4, 4, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82515,7 +106245,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -82526,44 +106256,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 8 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 9216 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82571,13 +106301,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82603,6 +106333,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82612,6 +106343,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82630,29 +106362,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 531 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 674 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -82668,7 +106400,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82688,44 +106420,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3408 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82733,13 +106465,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82765,6 +106497,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82774,6 +106507,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82792,31 +106526,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 532 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 + SolutionIndex: 675 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82830,7 +106564,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -82847,7 +106581,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -82855,24 +106589,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 2 + LVCB: 2 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -82883,10 +106617,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -82899,9 +106633,9 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -82927,6 +106661,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -82936,6 +106671,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -82954,16 +106690,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 533 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -82975,10 +106711,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -82992,7 +106728,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83000,8 +106736,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -83009,47 +106745,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 64 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 LVCA: 8 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 8 + MacroTileA: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83057,13 +106793,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83089,6 +106825,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83098,6 +106835,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83116,31 +106854,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 534 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 677 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83154,7 +106892,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83171,7 +106909,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -83179,39 +106917,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83219,13 +106957,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83251,6 +106989,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83260,6 +106999,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83278,20 +107018,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 535 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 678 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -83299,10 +107039,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83316,7 +107056,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83325,7 +107065,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -83333,47 +107073,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83381,13 +107121,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83413,6 +107153,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83422,6 +107163,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83440,31 +107182,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 536 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 + SolutionIndex: 679 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 2 + SubGroupA: 8 + SubGroupB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 2, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83478,7 +107220,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83495,32 +107237,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83531,11 +107273,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83543,11 +107285,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -83575,6 +107317,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83584,6 +107327,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83602,31 +107346,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 537 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83640,7 +107384,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -83657,32 +107401,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 2 + LVCB: 2 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -83693,10 +107437,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -83706,12 +107450,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83737,6 +107481,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83746,6 +107491,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83764,16 +107510,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 538 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 + SolutionIndex: 681 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -83784,11 +107530,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83808,7 +107554,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83819,43 +107565,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 16 + LSPB: 16 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83863,13 +107613,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -83879,7 +107629,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -83895,6 +107645,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -83904,6 +107655,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83922,28 +107674,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 539 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 682 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 2 + SubGroup1: 8 + SubGroupA: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [2, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -83966,7 +107718,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83977,10 +107729,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -83994,15 +107746,11 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84026,7 +107774,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -84041,7 +107789,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -84057,6 +107805,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84066,6 +107815,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84084,8 +107834,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 540 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84104,7 +107854,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -84122,7 +107872,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84139,7 +107889,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -84147,39 +107897,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84187,13 +107937,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84219,6 +107969,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84228,6 +107979,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84246,31 +107998,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 684 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [4, 4, 8] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84284,7 +108036,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84293,7 +108045,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -84301,47 +108053,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 8 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84349,13 +108101,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84381,6 +108133,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84390,6 +108143,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84408,31 +108162,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 2 + SubGroupA: 8 + SubGroupB: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 2, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84446,7 +108200,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84466,29 +108220,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 + LVCA: 2 + LVCB: 2 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84499,10 +108253,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -84512,12 +108266,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84543,6 +108297,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84552,6 +108307,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84570,14 +108326,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 + SolutionIndex: 686 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -84590,11 +108346,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84608,7 +108364,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84625,7 +108381,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -84633,39 +108389,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84673,13 +108429,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84705,6 +108461,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84714,6 +108471,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84732,16 +108490,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 687 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -84752,11 +108510,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [4, 4, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84790,7 +108548,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -84804,15 +108562,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84824,9 +108582,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -84835,11 +108593,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -84867,6 +108625,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -84876,6 +108635,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -84894,8 +108654,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 688 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84904,19 +108664,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84932,7 +108692,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84949,32 +108709,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -84985,11 +108745,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84997,13 +108757,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85029,6 +108789,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85038,6 +108799,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85056,31 +108818,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 689 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -85102,7 +108864,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -85111,47 +108873,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6752 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85159,13 +108921,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 64 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85191,6 +108953,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85200,6 +108963,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85218,29 +108982,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 690 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 2 + SubGroup1: 8 + SubGroupA: 2 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [2, 8, 4] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85265,7 +109029,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -85273,47 +109037,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 16 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85321,13 +109085,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85353,6 +109117,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85362,6 +109127,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85380,16 +109146,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 691 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -85400,9 +109166,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85418,7 +109184,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -85435,7 +109201,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -85443,39 +109209,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85483,13 +109249,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85515,6 +109281,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85524,6 +109291,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85542,16 +109310,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 692 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -85562,11 +109330,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [4, 4, 8] + WorkGroupMapping: 64 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -85597,7 +109365,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -85614,15 +109382,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85634,9 +109402,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -85645,11 +109413,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -85668,6 +109436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85677,6 +109446,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85686,6 +109456,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85704,8 +109475,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 + SolutionIndex: 693 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85713,20 +109484,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85749,7 +109520,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -85776,15 +109547,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85796,10 +109567,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85807,12 +109578,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85830,6 +109603,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85839,6 +109613,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -85848,6 +109623,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -85866,8 +109642,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 694 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85876,23 +109652,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85911,7 +109685,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -85938,15 +109712,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -85958,9 +109732,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -85969,11 +109743,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -85992,6 +109768,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86001,6 +109778,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86010,6 +109788,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86028,8 +109807,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 695 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86038,10 +109817,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -86050,11 +109829,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86083,7 +109860,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -86100,15 +109877,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86120,9 +109897,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -86131,11 +109908,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -86154,6 +109931,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86163,6 +109941,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86172,6 +109951,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86190,8 +109970,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 + SolutionIndex: 696 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86199,20 +109979,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86245,7 +110025,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -86262,15 +110042,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86283,9 +110063,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86293,12 +110073,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86316,6 +110096,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86325,6 +110106,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86334,6 +110116,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86352,8 +110135,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 697 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86361,12 +110144,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] + SuppressNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -86374,7 +110157,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -86397,7 +110180,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -86407,32 +110190,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86444,10 +110227,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86455,13 +110238,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86478,6 +110263,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86487,6 +110273,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86496,6 +110283,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86514,33 +110302,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 + SolutionIndex: 698 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86559,7 +110345,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -86569,10 +110355,10 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -86586,15 +110372,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6240 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86606,10 +110392,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86617,12 +110403,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86640,6 +110428,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86649,6 +110438,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86658,6 +110448,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86676,8 +110467,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 699 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86685,24 +110476,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86714,7 +110503,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86731,32 +110520,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 9280 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86767,10 +110556,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -86779,12 +110568,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86802,6 +110591,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86811,6 +110601,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86820,6 +110611,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -86838,31 +110630,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -86876,7 +110668,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -86896,29 +110688,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14464 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 4160 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -86929,10 +110721,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -86941,12 +110733,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86964,6 +110756,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86973,6 +110766,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -86982,6 +110776,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87000,31 +110795,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 + SolutionIndex: 701 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87038,14 +110833,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -87055,32 +110850,32 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87091,11 +110886,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87103,8 +110898,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -87126,6 +110923,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87135,6 +110933,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87144,6 +110943,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87162,8 +110962,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 + SolutionIndex: 702 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87171,24 +110971,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87200,16 +110998,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -87225,24 +111023,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 32 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 32 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3424 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 320 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87253,11 +111051,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87265,12 +111063,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87288,6 +111088,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87297,6 +111098,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87306,6 +111108,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87324,33 +111127,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 703 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87362,7 +111163,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87371,7 +111172,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -87387,24 +111188,24 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 + LSCA: 16 + LSCB: 16 + LSPA: 64 LSPB: 64 - LVCA: 2 + LVCA: 4 LVCB: 4 - LVPA: 32 - LVPB: 32 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3680 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87415,11 +111216,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87427,12 +111228,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87450,6 +111251,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87459,6 +111261,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87468,6 +111271,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87486,31 +111290,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 704 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -87551,22 +111355,22 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -87578,10 +111382,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87589,13 +111393,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -87612,6 +111416,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87621,6 +111426,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87630,6 +111436,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87648,29 +111455,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -87693,16 +111500,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -87714,20 +111521,20 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -87740,9 +111547,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -87751,12 +111558,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87774,6 +111583,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87783,6 +111593,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87792,6 +111603,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87810,33 +111622,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87855,16 +111665,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -87875,21 +111685,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -87903,9 +111713,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87913,12 +111723,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87936,6 +111748,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -87945,6 +111758,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -87954,6 +111768,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -87972,8 +111787,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87982,11 +111797,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -87997,8 +111812,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88018,15 +111831,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -88037,21 +111850,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -88064,10 +111877,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88080,7 +111893,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88098,6 +111911,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88107,6 +111921,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88116,6 +111931,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88134,8 +111950,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 708 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88144,13 +111960,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -88180,7 +111996,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -88188,7 +112004,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -88199,21 +112015,21 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true @@ -88226,10 +112042,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88237,12 +112053,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88260,6 +112076,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88269,6 +112086,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88278,6 +112096,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88296,28 +112115,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 709 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -88354,7 +112173,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -88368,15 +112187,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88388,10 +112207,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88400,11 +112219,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88422,6 +112241,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88431,6 +112251,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88440,6 +112261,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88458,15 +112280,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 + SolutionIndex: 710 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -88478,8 +112300,8 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -88502,39 +112324,43 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4736 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 4160 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -88546,10 +112372,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 256 - MacroTile1: 32 - MacroTileA: 256 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88558,11 +112384,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88573,13 +112401,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88589,6 +112418,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88598,6 +112428,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88616,33 +112447,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 + SolutionIndex: 711 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88661,17 +112490,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 3 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -88681,20 +112510,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -88708,10 +112537,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88721,10 +112550,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -88742,6 +112573,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88751,6 +112583,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88760,6 +112593,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88778,8 +112612,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88787,14 +112621,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -88803,8 +112637,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88823,17 +112655,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -88843,20 +112675,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -88871,9 +112703,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88881,13 +112713,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88904,6 +112738,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -88913,6 +112748,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -88922,6 +112758,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -88940,33 +112777,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 713 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88985,42 +112820,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89044,11 +112879,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89066,6 +112903,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89075,6 +112913,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89084,6 +112923,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89102,33 +112942,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 + SolutionIndex: 714 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89147,42 +112985,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89195,9 +113033,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89205,12 +113043,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89228,6 +113068,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89237,6 +113078,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89246,6 +113088,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89264,8 +113107,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 715 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89274,23 +113117,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89310,41 +113151,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 7264 LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1088 LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89368,11 +113209,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89390,6 +113231,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89399,6 +113241,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89408,6 +113251,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89426,8 +113270,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 716 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89442,13 +113286,13 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -89472,41 +113316,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7264 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -89518,10 +113362,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89530,11 +113374,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89552,6 +113396,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89561,6 +113406,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89570,6 +113416,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89588,8 +113435,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 717 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89598,19 +113445,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -89633,16 +113480,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -89653,20 +113500,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -89680,9 +113527,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -89691,12 +113538,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89714,6 +113563,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89723,6 +113573,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89732,6 +113583,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89750,8 +113602,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 718 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89760,23 +113612,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89794,9 +113644,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -89804,7 +113654,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -89815,16 +113665,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2688 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 2112 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -89838,10 +113692,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89849,12 +113703,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -89865,13 +113721,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -89881,6 +113738,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -89890,6 +113748,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -89908,33 +113767,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 719 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89954,7 +113811,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -89962,7 +113819,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -89973,20 +113830,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 - LVCA: 4 + LVCA: 8 LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90000,10 +113857,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90011,12 +113868,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90034,6 +113891,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90043,6 +113901,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90052,6 +113911,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90070,29 +113930,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 + SolutionIndex: 720 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90116,15 +113976,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90135,20 +113995,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90162,10 +114022,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90178,7 +114038,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90196,6 +114056,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90205,6 +114066,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90214,6 +114076,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90232,15 +114095,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 + SolutionIndex: 721 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 @@ -90253,8 +114116,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90277,17 +114140,17 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -90299,18 +114162,18 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90324,10 +114187,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90335,13 +114198,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90358,6 +114223,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90367,6 +114233,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90376,6 +114243,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90394,33 +114262,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 722 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90439,16 +114305,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90461,18 +114327,18 @@ LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 3136 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90486,10 +114352,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90497,13 +114363,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -90520,6 +114388,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90529,6 +114398,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90538,6 +114408,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90556,20 +114427,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 723 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -90577,12 +114448,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90603,14 +114472,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90622,19 +114491,19 @@ LSCA: 16 LSCB: 16 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 8 - LVCB: 4 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90648,10 +114517,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90659,11 +114528,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -90682,6 +114551,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90691,6 +114561,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90700,6 +114571,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90718,29 +114590,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 724 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -90763,16 +114635,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90783,20 +114655,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90810,10 +114682,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90823,9 +114695,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -90844,6 +114718,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -90853,6 +114728,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -90862,6 +114738,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -90880,15 +114757,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 725 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [8, 4] ThreadTile0: 8 @@ -90901,12 +114778,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90926,15 +114801,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -90945,20 +114820,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2112 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -90972,9 +114847,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -90983,12 +114858,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -91006,6 +114881,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91015,6 +114891,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91024,6 +114901,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91042,8 +114920,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 726 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91052,10 +114930,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -91064,7 +114942,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91087,7 +114965,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -91147,6 +115025,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -91168,6 +115048,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91177,6 +115058,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91186,6 +115068,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91204,8 +115087,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 727 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91220,17 +115103,15 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91277,174 +115158,12 @@ LVPB: 16 LdcEqualsLdd: false LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: true - TransposeB: false - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - UnrollMemFence: false - UseSgprForGRO: 1 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 - WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 12416 - LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 10304 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 LdsPadA: 4 LdsPadB: 4 LocalDotLayout: 1 @@ -91458,9 +115177,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91469,11 +115188,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -91492,6 +115211,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91501,6 +115221,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91510,6 +115231,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91528,8 +115250,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 728 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91538,10 +115260,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -91550,7 +115272,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91572,10 +115294,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -91586,29 +115308,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 32 LVCA: 4 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91620,9 +115338,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91631,13 +115349,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91647,13 +115365,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91663,6 +115382,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91672,6 +115392,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91690,29 +115411,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 + SolutionIndex: 729 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -91734,43 +115455,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91782,10 +115499,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91793,13 +115510,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91809,13 +115528,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91825,6 +115545,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91834,6 +115555,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -91852,33 +115574,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 + SolutionIndex: 730 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 32 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91896,43 +115616,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 576 - LdsOffsetB_Blk: 4672 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -91945,9 +115661,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91955,13 +115671,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -91971,13 +115689,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -91987,6 +115706,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -91996,6 +115716,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92014,33 +115735,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 + SolutionIndex: 731 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92052,14 +115771,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92072,29 +115791,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92105,11 +115820,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92118,11 +115833,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92133,13 +115850,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92149,6 +115867,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92158,6 +115877,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92176,16 +115896,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -92196,13 +115916,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92214,13 +115932,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92234,29 +115952,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92267,7 +115981,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -92280,11 +115994,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92295,13 +116009,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92311,6 +116026,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92320,6 +116036,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92338,8 +116055,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92347,7 +116064,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -92358,11 +116075,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92376,14 +116093,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92393,32 +116110,28 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 5 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 13568 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92429,11 +116142,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92441,13 +116154,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92457,13 +116172,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92473,6 +116189,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92482,6 +116199,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92500,33 +116218,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 + SolutionIndex: 734 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 32, 1] + VectorWidth: 1 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92538,13 +116254,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92558,29 +116274,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92591,11 +116303,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92603,13 +116315,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92619,13 +116331,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92635,6 +116348,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92644,6 +116358,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92662,31 +116377,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 735 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92700,14 +116415,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -92720,29 +116435,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92753,11 +116464,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92765,13 +116476,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92781,13 +116494,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92797,6 +116511,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92806,6 +116521,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92824,33 +116540,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 736 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92862,13 +116576,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -92882,29 +116596,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 8 - LVCB: 8 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -92915,11 +116625,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92927,13 +116637,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92943,13 +116653,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -92959,6 +116670,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -92968,6 +116680,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -92986,31 +116699,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 737 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -93024,14 +116737,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93044,29 +116757,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93077,11 +116786,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93089,12 +116798,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -93105,13 +116816,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93121,6 +116833,7 @@ Index1: 1 IndexAssignmentsA: [3, 0, 2] IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 0 IndexUnrollB: 0 @@ -93130,6 +116843,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -93148,8 +116862,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 + SolutionIndex: 738 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93157,24 +116871,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93192,10 +116904,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93206,7 +116918,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -93214,21 +116926,17 @@ LSCA: 16 LSCB: 16 LSPA: 64 - LSPB: 32 + LSPB: 64 LVCA: 4 - LVCB: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93241,9 +116949,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93251,8 +116959,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -93267,13 +116975,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93293,8 +117002,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93312,8 +117021,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -93321,20 +117030,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -93356,8 +117065,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93370,29 +117079,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 + LSPA: 32 + LSPB: 32 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93404,10 +117109,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93415,13 +117120,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93431,13 +117138,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93457,8 +117165,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93476,33 +117184,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 740 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93514,16 +117220,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93539,39 +117245,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93579,8 +117281,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -93595,13 +117297,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93621,8 +117324,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93640,31 +117343,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 + SolutionIndex: 741 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -93678,14 +117381,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -93698,29 +117401,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1104 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -93731,7 +117430,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -93743,13 +117442,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -93759,13 +117460,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93785,8 +117487,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93804,33 +117506,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 600 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 + SolutionIndex: 742 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] + VectorWidth: 1 + WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93842,16 +117542,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -93862,44 +117562,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 4 + LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 1 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 + LoopUnroll: 32 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93907,12 +117603,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -93923,13 +117621,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -93949,8 +117648,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -93968,33 +117667,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 601 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 + SolutionIndex: 743 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 8 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94006,13 +117703,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 @@ -94026,44 +117723,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 864 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94071,8 +117764,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -94087,13 +117780,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94113,8 +117807,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94132,31 +117826,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 602 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 + SolutionIndex: 744 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94170,16 +117864,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94190,44 +117884,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94235,11 +117925,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 @@ -94251,13 +117943,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94277,8 +117970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94296,33 +117989,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 603 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 + SolutionIndex: 745 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94334,13 +118025,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -94354,29 +118045,25 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 16 + LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 2 + LVCA: 4 + LVCB: 4 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -94387,10 +118074,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94400,12 +118087,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94415,13 +118102,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94441,8 +118129,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94460,16 +118148,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 604 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 + SolutionIndex: 746 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -94480,11 +118168,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94498,16 +118186,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94518,44 +118206,40 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 576 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 8 - MacroTileA: 16 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94563,12 +118247,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -94579,13 +118263,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94605,8 +118290,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94624,31 +118309,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 605 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 + SolutionIndex: 747 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 2] + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -94662,13 +118347,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -94679,47 +118364,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1664 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94727,8 +118408,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -94743,13 +118424,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94769,8 +118451,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94788,37 +118470,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 606 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 + SolutionIndex: 748 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94826,7 +118508,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94835,7 +118517,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94846,44 +118528,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 8 - LVCA: 2 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94891,13 +118574,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -94907,6 +118590,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -94914,6 +118598,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -94933,8 +118618,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -94952,37 +118637,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 607 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 + SolutionIndex: 749 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 2, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 32 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -94990,15 +118675,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -95006,7 +118691,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95014,23 +118699,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 128 - LSPB: 64 - LVCA: 2 - LVCB: 4 - LVPA: 32 - LVPB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -95043,11 +118729,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95055,12 +118741,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -95071,6 +118759,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95078,6 +118767,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95097,8 +118787,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95116,8 +118806,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 608 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -95125,12 +118815,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] + SuppressNoLoadLoop: false + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -95138,15 +118828,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95154,14 +118842,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -95178,23 +118866,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -95207,11 +118896,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95219,13 +118908,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95235,13 +118926,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95261,8 +118954,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95280,37 +118973,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 609 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95327,7 +119018,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -95338,43 +119029,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -95384,12 +119076,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95399,6 +119091,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95406,6 +119099,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95425,8 +119119,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95444,29 +119138,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 610 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 + SolutionIndex: 752 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 2 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [2, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -95474,7 +119168,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95488,9 +119182,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -95506,35 +119200,40 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 16 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -95543,8 +119242,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -95559,13 +119258,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95585,8 +119286,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95604,14 +119305,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 611 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -95625,8 +119326,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -95634,7 +119335,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95642,39 +119343,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -95688,18 +119390,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95707,13 +119409,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95723,6 +119428,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95730,6 +119436,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95749,8 +119456,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95768,16 +119475,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 612 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 + SolutionIndex: 754 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -95789,16 +119496,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95813,8 +119518,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -95822,7 +119527,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -95830,40 +119535,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 8 - LVCA: 2 + LSPA: 64 + LSPB: 64 + LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3168 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95871,13 +119577,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -95887,6 +119596,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -95894,6 +119604,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -95913,8 +119624,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -95932,16 +119643,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 613 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 + SolutionIndex: 755 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 2 - SubGroupA: 8 - SubGroupB: 2 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -95953,16 +119664,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 2, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95970,14 +119679,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -95990,29 +119699,26 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2144 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96023,11 +119729,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96036,12 +119742,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96051,13 +119760,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96077,8 +119788,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96096,16 +119807,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 614 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 + SolutionIndex: 756 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96116,17 +119827,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96134,14 +119843,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -96151,47 +119860,44 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96199,13 +119905,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96215,13 +119924,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96241,8 +119952,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96260,15 +119971,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 615 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 + SolutionIndex: 757 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -96281,16 +119992,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 4, 8] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96322,6 +120031,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -96370,6 +120080,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96379,6 +120090,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96386,6 +120098,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96405,8 +120118,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96424,8 +120137,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 616 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 758 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96433,7 +120146,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96446,7 +120159,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -96454,7 +120167,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96462,13 +120175,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -96486,25 +120199,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 32 - LSPB: 32 - LVCA: 2 - LVCB: 2 - LVPA: 8 - LVPB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetB: 1088 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -96515,11 +120225,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96533,7 +120243,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96543,13 +120254,15 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96569,8 +120282,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96588,16 +120301,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 617 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 759 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -96609,16 +120322,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96635,7 +120348,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96650,39 +120363,40 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 64 + LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -96696,8 +120410,9 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96707,6 +120422,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96714,6 +120430,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96733,8 +120450,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96752,20 +120469,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 618 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 + SolutionIndex: 760 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 2 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 2 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -96773,8 +120490,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [2, 8, 4] - WorkGroupMapping: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -96782,7 +120499,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96790,64 +120507,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 16 - LVCA: 4 - LVCB: 8 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96855,13 +120573,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -96871,6 +120592,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -96878,6 +120600,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -96897,8 +120620,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -96916,37 +120639,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 619 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 761 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 4 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96961,8 +120682,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -96974,44 +120695,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 320 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 320 + LdsOffsetB_Blk: 2368 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97019,13 +120741,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97035,6 +120760,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97042,6 +120768,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -97061,8 +120788,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97080,37 +120807,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 620 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 + SolutionIndex: 762 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 4 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [4, 4, 8] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97118,64 +120843,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97183,13 +120909,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97199,6 +120928,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97226,8 +120956,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97245,37 +120975,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 621 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97283,7 +121011,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -97292,7 +121020,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97303,44 +121031,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97348,15 +121077,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97366,6 +121096,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97393,8 +121124,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97412,35 +121143,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 622 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 764 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97455,57 +121186,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 16 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97513,15 +121245,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97531,6 +121262,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97558,8 +121290,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97577,35 +121309,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 623 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 765 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97620,9 +121354,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97637,40 +121371,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97678,13 +121413,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97694,6 +121432,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97721,8 +121460,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97740,37 +121479,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 624 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 766 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97778,7 +121515,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -97787,7 +121524,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -97802,23 +121539,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 96 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 24 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97831,11 +121569,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97843,13 +121581,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -97859,6 +121598,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -97886,8 +121626,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -97905,8 +121645,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 625 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 767 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97914,12 +121654,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -97929,13 +121669,13 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -97943,7 +121683,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -97951,7 +121691,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -97967,23 +121707,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97996,10 +121737,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98008,15 +121749,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98026,8 +121768,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -98053,8 +121796,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98072,8 +121815,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 626 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 768 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98081,14 +121824,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -98096,11 +121839,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98108,7 +121851,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -98116,7 +121859,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -98132,23 +121875,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 - LSPB: 64 + LSPB: 128 LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98161,10 +121905,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98173,15 +121917,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98191,6 +121936,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98218,8 +121964,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98237,8 +121983,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 627 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 769 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98246,11 +121992,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 @@ -98261,11 +122007,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98273,16 +122019,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -98297,23 +122043,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98326,11 +122073,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98338,13 +122085,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98354,8 +122104,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -98381,8 +122132,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98400,8 +122151,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 628 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98409,28 +122160,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98445,7 +122194,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98462,6 +122211,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -98505,11 +122255,14 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98519,6 +122272,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98546,8 +122300,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98565,8 +122319,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 629 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98574,7 +122328,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 @@ -98590,12 +122344,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98603,14 +122355,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -98627,23 +122379,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98656,7 +122409,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -98670,13 +122423,12 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98686,6 +122438,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98713,8 +122466,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98732,8 +122485,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 630 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 772 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98741,26 +122494,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 8] ThreadTile0: 8 ThreadTile1: 8 ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98792,6 +122547,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -98802,13 +122558,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98822,10 +122578,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98833,15 +122589,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -98851,6 +122608,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -98878,8 +122636,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -98897,8 +122655,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 631 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 773 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98906,12 +122664,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -98925,7 +122683,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -98933,7 +122691,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -98942,7 +122700,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -98957,23 +122715,24 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98986,11 +122745,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98998,13 +122757,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99014,6 +122774,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99041,8 +122802,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -99060,8 +122821,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 632 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 774 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99069,28 +122830,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99106,7 +122867,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -99122,40 +122883,41 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99163,13 +122925,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99179,6 +122942,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99206,8 +122970,8 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false TLUA: false @@ -99225,29 +122989,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 633 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 775 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -99255,7 +123019,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99283,10 +123047,11 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -99297,15 +123062,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99318,25 +123083,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99346,6 +123114,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99373,9 +123142,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -99392,8 +123162,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 634 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99401,18 +123171,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -99420,7 +123190,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99452,6 +123222,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -99462,15 +123233,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99483,25 +123254,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99511,6 +123285,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99538,9 +123313,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -99557,8 +123333,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 635 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 777 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99566,12 +123342,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -99585,7 +123361,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99600,16 +123376,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -99617,25 +123393,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99648,23 +123425,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99674,6 +123456,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99701,9 +123484,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -99720,8 +123504,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 636 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 778 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99729,14 +123513,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -99745,12 +123529,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99765,16 +123547,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -99782,25 +123564,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99813,23 +123596,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -99839,6 +123627,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -99866,9 +123655,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -99885,8 +123675,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 637 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 779 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99894,12 +123684,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -99907,15 +123697,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -99931,41 +123719,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7200 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -99978,23 +123767,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100004,6 +123796,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100031,9 +123824,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100050,8 +123844,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 638 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 780 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100059,18 +123853,18 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -100080,7 +123874,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100095,16 +123889,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100112,25 +123906,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100143,25 +123938,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100171,6 +123967,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100198,9 +123995,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100217,8 +124015,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 639 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 781 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100226,12 +124024,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -100239,13 +124037,15 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100260,16 +124060,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100277,25 +124077,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100308,25 +124109,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100336,6 +124138,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100363,9 +124166,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100382,8 +124186,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 640 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 782 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100391,14 +124195,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -100407,10 +124211,12 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100424,17 +124230,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -100442,25 +124248,22 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100473,25 +124276,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100501,7 +124305,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -100528,9 +124333,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100547,8 +124353,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 641 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 783 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100556,12 +124362,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -100569,13 +124375,15 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100607,6 +124415,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -100617,15 +124426,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100638,25 +124447,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100666,6 +124478,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100693,9 +124506,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100712,8 +124526,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 642 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 784 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100721,14 +124535,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -100740,7 +124554,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100772,6 +124586,7 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 @@ -100782,15 +124597,15 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100803,25 +124618,28 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100831,6 +124649,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -100858,9 +124677,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -100877,8 +124697,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 643 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 785 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -100886,12 +124706,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SuppressNoLoadLoop: false + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -100899,13 +124719,13 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100913,49 +124733,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -100966,7 +124787,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -100974,17 +124795,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -100994,8 +124820,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -101021,9 +124848,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101040,8 +124868,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 644 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 786 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101049,28 +124877,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101085,42 +124911,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7264 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101139,17 +124966,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101159,8 +124991,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -101186,9 +125019,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101205,8 +125039,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 645 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 787 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101214,7 +125048,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -101225,17 +125059,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101251,15 +125083,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101267,25 +125099,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101304,19 +125137,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101326,6 +125162,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101353,9 +125190,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101372,8 +125210,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 646 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 788 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101381,14 +125219,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -101400,7 +125238,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101416,15 +125254,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101432,25 +125270,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101469,19 +125308,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101491,6 +125333,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101518,9 +125361,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101537,8 +125381,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 647 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 789 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101546,7 +125390,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -101559,13 +125403,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101573,7 +125417,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101597,25 +125441,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101626,7 +125471,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -101634,7 +125479,9 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -101642,9 +125489,10 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101654,6 +125502,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101681,9 +125530,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101700,8 +125550,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 648 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 790 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101709,14 +125559,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -101724,13 +125574,13 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101746,41 +125596,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101799,17 +125650,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101819,6 +125673,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -101846,9 +125701,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -101865,8 +125721,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 649 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 791 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101874,7 +125730,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 @@ -101885,9 +125741,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -101895,7 +125751,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101903,23 +125759,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -101927,25 +125783,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -101956,27 +125813,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -101986,8 +125844,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -102013,9 +125872,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102032,8 +125892,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 650 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 792 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102041,26 +125901,28 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102075,16 +125937,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102092,25 +125954,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102122,26 +125985,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102151,6 +126015,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102178,9 +126043,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102197,8 +126063,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 651 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 793 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102206,12 +126072,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -102222,10 +126088,12 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102241,15 +126109,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102257,25 +126125,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102287,14 +126156,16 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 @@ -102302,9 +126173,10 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102314,6 +126186,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -102341,9 +126214,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102360,8 +126234,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 652 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 794 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102369,20 +126243,20 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -102390,7 +126264,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102398,7 +126272,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102422,25 +126296,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102451,7 +126326,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -102459,19 +126334,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102481,8 +126359,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -102508,9 +126387,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102527,8 +126407,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 653 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 795 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102536,7 +126416,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -102549,13 +126429,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102563,14 +126443,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -102587,25 +126467,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2112 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2112 - LdsOffsetB_Blk: 6208 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102616,7 +126497,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -102624,17 +126505,22 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102644,8 +126530,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -102671,9 +126558,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102690,8 +126578,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 654 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 796 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102699,7 +126587,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -102714,13 +126602,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102728,7 +126614,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102737,7 +126623,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -102748,29 +126634,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102781,27 +126668,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102811,8 +126701,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -102838,9 +126729,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -102857,8 +126749,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 655 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 797 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102866,26 +126758,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102893,16 +126785,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -102917,25 +126809,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -102946,25 +126839,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -102974,8 +126872,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103001,9 +126900,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103020,8 +126920,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 656 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 798 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103029,12 +126929,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -103042,15 +126942,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103064,8 +126962,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -103078,24 +126976,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true @@ -103108,24 +127011,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103135,7 +127043,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103162,9 +127071,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103181,37 +127091,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 657 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 799 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103219,45 +127127,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103268,27 +127181,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103298,8 +127212,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103325,9 +127240,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103344,35 +127260,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 658 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 800 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103380,45 +127298,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103429,27 +127352,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103459,8 +127383,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103486,9 +127411,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103505,35 +127431,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 659 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 801 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103541,45 +127469,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 8 + LSCB: 8 LSPA: 64 LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103590,27 +127523,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103620,7 +127554,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -103647,9 +127582,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103666,8 +127602,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 660 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 802 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103676,25 +127612,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103702,16 +127640,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103722,25 +127660,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 64 + LSCA: 8 + LSCB: 8 + LSPA: 128 LSPB: 64 - LVCA: 4 + LVCA: 2 LVCB: 4 - LVPA: 16 - LVPB: 16 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103751,25 +127694,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103779,8 +127725,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103806,9 +127753,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103825,8 +127773,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 661 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 803 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103835,27 +127783,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -103863,16 +127811,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -103883,25 +127831,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -103912,27 +127865,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -103942,8 +127896,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -103969,9 +127924,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -103988,35 +127944,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 662 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 804 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104024,16 +127982,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104044,25 +128002,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104073,25 +128036,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104101,8 +128067,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104128,9 +128095,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104147,37 +128115,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 663 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 805 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104185,16 +128153,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -104205,25 +128173,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 4 + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 2 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 32 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104234,27 +128207,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104264,7 +128238,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104291,9 +128266,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104310,35 +128286,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 664 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 806 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104352,39 +128330,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104396,24 +128379,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104423,8 +128411,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104450,9 +128439,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104469,37 +128459,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 665 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 + SolutionIndex: 807 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104513,39 +128501,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104557,26 +128550,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104586,8 +128582,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104613,9 +128610,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104632,8 +128630,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 666 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 808 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104642,17 +128640,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -104660,7 +128658,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104674,39 +128672,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2128 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104718,24 +128721,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104745,8 +128753,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -104772,9 +128781,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104791,8 +128801,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 667 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 809 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104801,27 +128811,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104835,39 +128843,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -104879,26 +128892,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -104908,7 +128924,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -104935,9 +128952,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -104954,35 +128972,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 668 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 810 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -104996,39 +129014,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105040,24 +129063,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105067,7 +129095,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105094,9 +129123,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105113,37 +129143,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 669 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 + SolutionIndex: 811 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105157,7 +129185,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -105171,25 +129199,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 + LSPA: 64 + LSPB: 64 LVCA: 4 LVCB: 4 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1104 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105201,26 +129234,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105230,7 +129266,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105257,9 +129294,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105276,35 +129314,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 670 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 + SolutionIndex: 812 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105312,13 +129350,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -105332,25 +129370,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2144 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 1 - LdsPadB: 1 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105361,27 +129404,30 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105391,7 +129437,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105418,9 +129465,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105437,35 +129485,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 671 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 + SolutionIndex: 813 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105473,15 +129521,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -105489,29 +129537,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 64 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 32 - LVCA: 2 - LVCB: 4 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 864 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105522,25 +129575,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105550,7 +129606,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -105577,9 +129634,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105596,37 +129654,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 672 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 814 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105640,39 +129698,44 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105684,26 +129747,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105713,7 +129777,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -105740,9 +129805,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105759,35 +129825,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 673 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 815 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105801,17 +129869,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -105819,21 +129887,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -105845,24 +129918,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -105872,8 +129948,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -105899,9 +129976,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -105918,29 +129996,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 674 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 816 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -105948,7 +130026,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105962,17 +130040,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -105980,21 +130058,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106006,24 +130089,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106033,7 +130119,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106060,9 +130147,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106079,28 +130167,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 675 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 817 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -106109,7 +130197,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -106123,17 +130211,17 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -106141,21 +130229,26 @@ GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 32 LSPB: 32 - LVCA: 4 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -106167,24 +130260,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106194,7 +130290,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -106221,9 +130318,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106240,28 +130338,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 676 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 818 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -106313,13 +130411,13 @@ LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106333,24 +130431,27 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106362,7 +130463,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -106388,9 +130489,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106407,8 +130509,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 677 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 819 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106417,11 +130519,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -106429,10 +130531,10 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 32 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -106452,16 +130554,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -106473,10 +130575,10 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false @@ -106507,19 +130609,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106531,7 +130634,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -106557,9 +130660,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106576,8 +130680,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 678 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 820 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106601,6 +130705,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106619,7 +130725,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106674,19 +130780,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106698,7 +130805,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -106724,9 +130831,181 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106743,8 +131022,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 679 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 + SolutionIndex: 822 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106768,6 +131047,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106786,9 +131067,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -106799,7 +131080,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -106807,20 +131088,20 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 64 - LSPB: 32 - LVCA: 4 - LVCB: 8 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106834,24 +131115,29 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -106889,9 +131175,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -106908,15 +131195,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 680 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 823 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -106928,13 +131215,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106953,7 +131238,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -106974,51 +131259,56 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 + LSPA: 32 LSPB: 64 LVCA: 8 LVCB: 4 - LVPA: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -107056,9 +131346,10 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 - NumIndicesSummation: 1 NumIndicesLD: 4 + NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -107075,14 +131366,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 681 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 + SolutionIndex: 824 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -107096,12 +131387,181 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 16, 4] + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 825 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [12, 896.219] @@ -110025,8 +134485,6 @@ - [231, 6307.6] - - [1024, 512, 1, 4608] - [242, 7953.38] - - - [2048, 256, 1, 768] - - [242, 7059.14] - - [4096, 200, 1, 32] - [191, 2199.19] - - [4096, 200, 1, 3328] @@ -112697,5598 +137155,6108 @@ - [336, 6145.5] - - [1024, 3712, 1, 1024] - [338, 8933.88] + - - [256, 256, 192, 64] + - [343, 8264.64] + - - [768, 4096, 1, 768] + - [356, 9642.08] + - - [768, 64, 1, 768] + - [353, 1850.43] + - - [768, 1280, 1, 768] + - [356, 8738.13] + - - [30522, 320, 1, 768] + - [357, 9733.59] + - - [128, 128, 96, 64] + - [346, 5470.83] + - - [2, 16, 1, 768] + - [349, 2.47742] + - - [30522, 1280, 1, 768] + - [355, 10127.9] + - - [30522, 640, 1, 768] + - [356, 9987.61] + - - [2, 8, 1, 768] + - [348, 0.96] + - - [768, 4096, 1, 3072] + - [358, 9479.41] + - - [768, 32, 1, 768] + - [352, 880.334] + - - [2, 64, 1, 768] + - [349, 9.99024] + - - [256, 256, 96, 64] + - [343, 7614.47] + - - [64, 64, 768, 64] + - [345, 5354.43] + - - [30522, 160, 1, 768] + - [354, 7740.11] + - - [768, 320, 1, 768] + - [347, 5423.67] + - - [128, 128, 384, 64] + - [344, 7179.98] + - - [768, 16, 1, 768] + - [350, 706.376] + - - [3072, 4096, 1, 768] + - [359, 9961.74] + - - [2048, 512, 1, 100] + - [361, 5180.71] + - - [1024, 200, 1, 560] + - [362, 4061.19] + - - [256, 1280, 1, 1024] + - [369, 4337.44] + - - [256, 44505, 1, 1024] + - [405, 8597.69] + - - [10240, 8976, 1, 256] + - [408, 9471.43] + - - [256, 7168, 1, 1024] + - [399, 6718.56] + - - [8448, 8976, 1, 256] + - [391, 9601.31] + - - [18944, 8976, 1, 256] + - [400, 9666.26] + - - [256, 19200, 1, 1024] + - [376, 7488.94] + - - [5632, 8976, 1, 256] + - [388, 9358.39] + - - [256, 23552, 1, 1024] + - [403, 7980.89] + - - [256, 6656, 1, 1024] + - [403, 6287.22] + - - [256, 14336, 1, 1024] + - [398, 7049.26] + - - [256, 12544, 1, 1024] + - [376, 6728.47] + - - [2048, 684, 1, 768] + - [393, 8479.18] + - - [5376, 8976, 1, 256] + - [388, 9519.51] + - - [256, 5888, 1, 1024] + - [408, 6012.4] + - - [19968, 8976, 1, 256] + - [400, 9684.67] + - - [3840, 8976, 1, 256] + - [385, 9461.89] + - - [4608, 8976, 1, 256] + - [385, 9305.82] + - - [256, 684, 1, 1024] + - [411, 3513.06] + - - [256, 22016, 1, 1024] + - [376, 7643.79] + - - [256, 23296, 1, 1024] + - [405, 8048.12] + - - [4864, 8976, 1, 256] + - [383, 9545.62] + - - [256, 7424, 1, 1024] + - [401, 6770.65] + - - [18176, 8976, 1, 256] + - [408, 9729.47] + - - [256, 15104, 1, 1024] + - [397, 7289.08] + - - [8192, 8976, 1, 256] + - [400, 9395.49] + - - [256, 16128, 1, 1024] + - [400, 7461.28] + - - [13312, 8976, 1, 256] + - [408, 9550.97] + - - [256, 21504, 1, 1024] + - [405, 7635.93] + - - [6400, 8976, 1, 256] + - [392, 9560.96] + - - [256, 8960, 1, 1024] + - [367, 6292.36] + - - [1792, 8976, 1, 256] + - [382, 9372.18] + - - [13824, 8976, 1, 256] + - [400, 9585.27] + - - [11776, 8976, 1, 256] + - [400, 9560.34] + - - [256, 20992, 1, 1024] + - [398, 7490.65] + - - [20480, 8976, 1, 256] + - [408, 9610.7] + - - [5888, 8976, 1, 256] + - [379, 9565.2] + - - [256, 10496, 1, 1024] + - [370, 6631.96] + - - [21248, 8976, 1, 256] + - [400, 9755.77] + - - [5120, 8976, 1, 256] + - [408, 9244.59] + - - [7168, 8976, 1, 256] + - [400, 9388.42] + - - [2048, 1536, 1, 768] + - [389, 9446.04] + - - [256, 8192, 1, 1024] + - [394, 6948.89] + - - [4096, 8976, 1, 256] + - [399, 9115.94] + - - [3328, 8976, 1, 256] + - [392, 9434.55] + - - [1280, 8976, 1, 256] + - [390, 9129.8] + - - [2560, 8976, 1, 256] + - [387, 9199.48] + - - [3072, 8976, 1, 256] + - [402, 8963.6] + - - [256, 11776, 1, 1024] + - [380, 6869.8] + - - [18688, 8976, 1, 256] + - [408, 9726.21] + - - [15104, 8976, 1, 256] + - [408, 9715.71] + - - [23552, 8976, 1, 256] + - [400, 9648.42] + - - [6144, 8976, 1, 256] + - [408, 9339.8] + - - [12544, 8976, 1, 256] + - [408, 9654.45] + - - [256, 11264, 1, 1024] + - [381, 6814.98] + - - [2048, 114, 1, 512] + - [412, 4583.5] + - - [4352, 8976, 1, 256] + - [392, 9471.4] + - - [15360, 8976, 1, 256] + - [408, 9583.77] + - - [256, 31488, 1, 1024] + - [407, 8438.01] + - - [28672, 8976, 1, 256] + - [400, 9688.85] + - - [256, 18176, 1, 1024] + - [376, 7405.09] + - - [9728, 8976, 1, 256] + - [408, 9524.15] + - - [256, 2816, 1, 1024] + - [372, 5405.66] + - - [256, 18944, 1, 1024] + - [376, 7503.41] + - - [256, 3584, 1, 1024] + - [375, 6107.15] + - - [7936, 8976, 1, 256] + - [388, 9608.31] + - - [19712, 8976, 1, 256] + - [408, 9736.25] + - - [256, 14848, 1, 1024] + - [381, 7163.42] + - - [256, 8448, 1, 1024] + - [381, 6372.56] + - - [256, 6400, 1, 1024] + - [395, 6395.71] + - - [256, 6144, 1, 1024] + - [406, 6490.22] + - - [9472, 8976, 1, 256] + - [385, 9609.92] + - - [256, 9984, 1, 1024] + - [368, 6484.75] + - - [684, 8976, 1, 256] + - [377, 8128.53] + - - [20992, 8976, 1, 256] + - [400, 9689.65] + - - [2048, 684, 1, 512] + - [384, 7241.78] + - - [2048, 114, 1, 768] + - [410, 4872.46] + - - [8960, 8976, 1, 256] + - [383, 9603.35] + - - [2048, 1536, 1, 512] + - [386, 8830.11] + - - [256, 3328, 1, 1024] + - [374, 5612.55] + - - [33536, 8976, 1, 256] + - [400, 9797.71] + - - [2048, 8976, 1, 256] + - [400, 8975.46] + - - [10496, 8976, 1, 256] + - [391, 9654.43] + - - [256, 5376, 1, 1024] + - [409, 5626.34] + - - [256, 21248, 1, 1024] + - [378, 7525.45] + - - [256, 13312, 1, 1024] + - [376, 6767.11] + - - [16128, 8976, 1, 256] + - [400, 9715.57] + - - [2304, 8976, 1, 256] + - [373, 9433.83] + - - [256, 4864, 1, 1024] + - [363, 5743.55] + - - [17152, 8976, 1, 256] + - [408, 9708.94] + - - [15872, 8976, 1, 256] + - [408, 9657.57] + - - [9984, 8976, 1, 256] + - [385, 9639.74] + - - [256, 14592, 1, 1024] + - [397, 7223.92] + - - [256, 33536, 1, 1024] + - [404, 8147.31] + - - [11264, 8976, 1, 256] + - [400, 9509.96] + - - [31488, 8976, 1, 256] + - [408, 9799.31] + - - [256, 20480, 1, 1024] + - [381, 7498.2] + - - [44505, 8976, 1, 256] + - [392, 9804.78] + - - [13568, 8976, 1, 256] + - [400, 9680.24] + - - [256, 11520, 1, 1024] + - [380, 6805.26] + - - [256, 7936, 1, 1024] + - [396, 6971.77] + - - [2048, 256, 1, 768] + - [366, 7129.13] + - - [256, 4608, 1, 1024] + - [364, 5462.91] + - - [256, 2304, 1, 1024] + - [371, 4842.69] + - - [256, 2560, 1, 1024] + - [372, 5309.25] + - - [2816, 8976, 1, 256] + - [383, 9409.56] - - [1024, 128, 1, 128] - - [353, 896.319] + - [425, 896.319] - - [4, 704, 1, 1280] - - [390, 328.976] + - [462, 328.976] - - [4, 1856, 1, 3328] - - [400, 501.461] + - [472, 501.461] - - [1856, 448, 1, 3328] - - [445, 5678.01] + - [517, 5678.01] - - [2944, 4288, 1, 1280] - - [431, 8412.49] + - [503, 8412.49] - - [2368, 64, 1, 3328] - - [381, 4914.02] + - [453, 4914.02] - - [1760, 32, 1, 1760] - - [408, 3313.04] + - [480, 3313.04] - - [2368, 5888, 1, 256] - - [431, 6489.82] + - [503, 6489.82] - - [5888, 1856, 1, 256] - - [443, 7791.98] + - [515, 7791.98] - - [128, 64, 1, 256] - - [415, 369.317] + - [487, 369.317] - - [512, 24000, 1, 1536] - - [437, 8827.47] + - [509, 8827.47] - - [128, 6784, 1, 3328] - - [437, 6537.09] + - [509, 6537.09] - - [5888, 1408, 1, 256] - - [451, 6129.71] + - [523, 6129.71] - - [5888, 1856, 1, 3328] - - [437, 7969.27] + - [509, 7969.27] - - [5056, 704, 1, 256] - - [437, 6723.92] + - [509, 6723.92] - - [2048, 400, 1, 512] - - [443, 4531.54] + - [515, 4531.54] - - [5888, 2944, 1, 3328] - - [443, 8608.14] + - [515, 8608.14] - - [1856, 4288, 1, 256] - - [443, 6297.64] + - [515, 6297.64] - - [1024, 5056, 1, 128] - - [421, 3595.47] + - [493, 3595.47] - - [5056, 5056, 1, 3328] - - [437, 8559.26] + - [509, 8559.26] - - [1408, 5888, 1, 1280] - - [432, 6797.16] + - [504, 6797.16] - - [2368, 448, 1, 128] - - [421, 2815.0] + - [493, 2815.0] - - [2368, 6784, 1, 128] - - [425, 4782.08] + - [497, 4782.08] - - [1024, 3584, 1, 3328] - - [433, 8402.54] + - [505, 8402.54] - - [512, 48000, 1, 2048] - - [437, 8162.33] + - [509, 8162.33] - - [128, 448, 1, 1280] - - [408, 2903.59] + - [480, 2903.59] - - [256, 4288, 1, 3328] - - [438, 6346.04] + - [510, 6346.04] - - [5888, 1408, 1, 1280] - - [437, 8959.55] + - [509, 8959.55] - - [704, 1856, 1, 3328] - - [432, 6955.37] + - [504, 6955.37] - - [4, 1408, 1, 128] - - [452, 60.1747] + - [524, 60.1747] - - [1024, 2368, 1, 256] - - [439, 5927.88] + - [511, 5927.88] - - [64, 4, 1, 256] - - [457, 13.3129] + - [529, 13.3129] - - [1408, 1856, 1, 1280] - - [435, 8051.68] + - [507, 8051.68] - - [1408, 64, 1, 1280] - - [411, 3400.55] + - [483, 3400.55] - - [448, 1024, 1, 1280] - - [439, 5730.02] + - [511, 5730.02] - - [6144, 24000, 1, 2048] - - [443, 7738.4] + - [515, 7738.4] - - [4096, 32, 1, 4096] - - [381, 2381.53] + - [453, 2381.53] - - [256, 1408, 1, 3328] - - [439, 4844.88] + - [511, 4844.88] - - [5056, 5056, 1, 1280] - - [443, 9090.2] + - [515, 9090.2] - - [448, 5056, 1, 256] - - [449, 4961.28] + - [521, 4961.28] - - [704, 1856, 1, 1280] - - [435, 6456.54] + - [507, 6456.54] - - [128, 5056, 1, 128] - - [364, 2251.12] + - [436, 2251.12] - - [2368, 128, 1, 256] - - [432, 3403.37] + - [504, 3403.37] - - [1760, 6400, 1, 1760] - - [431, 8959.8] + - [503, 8959.8] - - [1856, 1408, 1, 128] - - [424, 3493.16] + - [496, 3493.16] - - [64, 5056, 1, 256] - - [433, 2582.32] + - [505, 2582.32] - - [6784, 256, 1, 3328] - - [431, 7323.64] + - [503, 7323.64] - - [6784, 4288, 1, 3328] - - [433, 8542.19] + - [505, 8542.19] - - [4288, 448, 1, 256] - - [449, 5030.6] + - [521, 5030.6] - - [64, 704, 1, 128] - - [366, 375.567] + - [438, 375.567] - - [1856, 2368, 1, 3328] - - [442, 6742.44] + - [514, 6742.44] - - [4288, 2944, 1, 1280] - - [443, 8578.27] + - [515, 8578.27] - - [704, 5056, 1, 1280] - - [439, 8014.55] + - [511, 8014.55] - - [2368, 704, 1, 3328] - - [438, 6544.41] + - [510, 6544.41] - - [256, 5888, 1, 256] - - [436, 5933.0] + - [508, 5933.0] - - [1856, 4288, 1, 3328] - - [442, 7410.82] + - [514, 7410.82] - - [256, 2944, 1, 256] - - [438, 5014.08] + - [510, 5014.08] - - [5888, 1024, 1, 256] - - [443, 8069.44] + - [515, 8069.44] - - [448, 64, 1, 1280] - - [418, 2057.28] + - [490, 2057.28] - - [3072, 64, 1, 1024] - - [398, 2145.52] + - [470, 2145.52] - - [3584, 4, 1, 1280] - - [390, 498.743] + - [462, 498.743] - - [16384, 3200, 1, 4096] - - [430, 6621.53] + - [502, 6621.53] - - [2944, 64, 1, 256] - - [438, 2554.89] + - [510, 2554.89] - - [128, 4, 1, 1280] - - [400, 87.2489] + - [472, 87.2489] - - [1408, 2944, 1, 256] - - [437, 8029.45] + - [509, 8029.45] - - [256, 1856, 1, 1280] - - [432, 6170.7] + - [504, 6170.7] - - [6784, 5056, 1, 3328] - - [441, 7134.29] + - [513, 7134.29] - - [5056, 5056, 1, 256] - - [449, 6246.9] + - [521, 6246.9] - - [1408, 6784, 1, 128] - - [426, 4329.55] + - [498, 4329.55] - - [64, 1024, 1, 1280] - - [408, 3206.75] + - [480, 3206.75] - - [2944, 4, 1, 256] - - [457, 333.58] + - [529, 333.58] - - [704, 5056, 1, 128] - - [421, 4085.52] + - [493, 4085.52] - - [4, 2368, 1, 1280] - - [458, 394.767] + - [530, 394.767] - - [2368, 2944, 1, 1280] - - [437, 8634.05] + - [509, 8634.05] - - [128, 3584, 1, 1280] - - [438, 6046.25] + - [510, 6046.25] - - [6784, 6784, 1, 1280] - - [443, 8847.51] + - [515, 8847.51] - - [1408, 4288, 1, 1280] - - [443, 8236.79] + - [515, 8236.79] - - [3584, 4288, 1, 1280] - - [438, 7399.98] + - [510, 7399.98] - - [2368, 704, 1, 1280] - - [431, 6754.5] + - [503, 6754.5] - - [5056, 4288, 1, 3328] - - [437, 8569.63] + - [509, 8569.63] - - [3584, 2368, 1, 3328] - - [442, 7942.48] + - [514, 7942.48] - - [64, 704, 1, 1280] - - [411, 2363.69] + - [483, 2363.69] - - [4288, 256, 1, 256] - - [439, 4591.9] + - [511, 4591.9] - - [2944, 128, 1, 128] - - [364, 1878.39] + - [436, 1878.39] - - [6144, 32, 1, 2560] - - [409, 3334.2] + - [481, 3334.2] - - [6784, 448, 1, 1280] - - [441, 7939.3] + - [513, 7939.3] - - [1408, 2944, 1, 128] - - [425, 4096.61] + - [497, 4096.61] - - [4288, 2944, 1, 256] - - [431, 8141.23] + - [503, 8141.23] - - [5888, 704, 1, 1280] - - [432, 7516.23] + - [504, 7516.23] - - [5056, 4, 1, 3328] - - [375, 552.509] + - [447, 552.509] - - [1856, 64, 1, 1280] - - [381, 3870.86] + - [453, 3870.86] - - [1760, 16, 1, 1760] - - [393, 2181.51] + - [465, 2181.51] - - [448, 5888, 1, 128] - - [426, 3371.1] + - [498, 3371.1] - - [5888, 64, 1, 3328] - - [406, 5319.48] + - [478, 5319.48] - - [2944, 256, 1, 3328] - - [438, 7122.4] + - [510, 7122.4] - - [1024, 64, 1, 128] - - [353, 595.882] + - [425, 595.882] - - [5056, 2368, 1, 1280] - - [432, 7778.29] + - [504, 7778.29] - - [448, 3584, 1, 1280] - - [437, 6500.62] + - [509, 6500.62] - - [6784, 5888, 1, 256] - - [437, 8918.68] + - [509, 8918.68] - - [704, 1024, 1, 128] - - [421, 2627.51] + - [493, 2627.51] - - [704, 128, 1, 1280] - - [408, 3408.59] + - [480, 3408.59] - - [4, 3584, 1, 128] - - [452, 140.821] + - [524, 140.821] - - [1408, 448, 1, 1280] - - [432, 5881.54] + - [504, 5881.54] - - [1024, 1408, 1, 256] - - [436, 5647.27] + - [508, 5647.27] - - [2368, 2368, 1, 3328] - - [430, 7688.83] + - [502, 7688.83] - - [1856, 6784, 1, 128] - - [421, 4705.95] + - [493, 4705.95] - - [5056, 704, 1, 3328] - - [441, 8198.98] + - [513, 8198.98] - - [1408, 1856, 1, 256] - - [443, 6340.05] + - [515, 6340.05] - - [1408, 704, 1, 3328] - - [435, 7599.65] + - [507, 7599.65] - - [2368, 5056, 1, 256] - - [443, 8242.85] + - [515, 8242.85] - - [1408, 256, 1, 1280] - - [438, 4879.26] + - [510, 4879.26] - - [3072, 128, 1, 1024] - - [407, 2525.52] + - [479, 2525.52] - - [3584, 2368, 1, 1280] - - [439, 8132.72] + - [511, 8132.72] - - [4288, 64, 1, 3328] - - [394, 5156.53] + - [466, 5156.53] - - [2368, 4, 1, 1280] - - [456, 482.75] + - [528, 482.75] - - [704, 5888, 1, 256] - - [446, 5398.75] + - [518, 5398.75] - - [6784, 2944, 1, 128] - - [422, 4748.99] + - [494, 4748.99] - - [2560, 1600, 1, 2560] - - [433, 7355.0] + - [505, 7355.0] - - [4288, 6784, 1, 3328] - - [430, 7409.41] + - [502, 7409.41] - - [2944, 256, 1, 256] - - [438, 5077.42] + - [510, 5077.42] - - [2944, 6784, 1, 3328] - - [443, 8068.05] + - [515, 8068.05] - - [704, 1408, 1, 3328] - - [438, 7239.43] + - [510, 7239.43] - - [6144, 5984, 1, 2048] - - [437, 7176.07] + - [509, 7176.07] - - [3584, 704, 1, 3328] - - [443, 6642.86] + - [515, 6642.86] - - [2944, 256, 1, 128] - - [422, 2644.54] + - [494, 2644.54] - - [6784, 4, 1, 1280] - - [454, 402.487] + - [526, 402.487] - - [1024, 64, 1, 1280] - - [408, 2602.03] + - [480, 2602.03] - - [2048, 1600, 1, 512] - - [435, 5592.5] + - [507, 5592.5] - - [448, 4288, 1, 256] - - [433, 6128.99] + - [505, 6128.99] - - [64, 3584, 1, 3328] - - [374, 5534.93] + - [446, 5534.93] - - [1856, 4288, 1, 128] - - [424, 4400.11] + - [496, 4400.11] - - [704, 2368, 1, 1280] - - [449, 5735.02] + - [521, 5735.02] - - [1856, 2368, 1, 1280] - - [446, 6482.4] + - [518, 6482.4] - - [2368, 128, 1, 3328] - - [419, 4717.32] + - [491, 4717.32] - - [2944, 128, 1, 256] - - [446, 3276.9] + - [518, 3276.9] - - [448, 1408, 1, 256] - - [438, 4852.28] + - [510, 4852.28] - - [1856, 4288, 1, 1280] - - [433, 8132.96] + - [505, 8132.96] - - [64, 5056, 1, 3328] - - [409, 5097.06] + - [481, 5097.06] - - [4, 704, 1, 256] - - [456, 128.831] + - [528, 128.831] - - [1024, 448, 1, 128] - - [421, 1816.94] + - [493, 1816.94] - - [704, 4, 1, 1280] - - [457, 328.976] + - [529, 328.976] - - [704, 256, 1, 128] - - [425, 876.569] + - [497, 876.569] - - [704, 2944, 1, 128] - - [425, 3734.47] + - [497, 3734.47] - - [1408, 1024, 1, 1280] - - [433, 7224.85] + - [505, 7224.85] - - [704, 6784, 1, 256] - - [437, 7354.77] + - [509, 7354.77] - - [6784, 704, 1, 256] - - [433, 6012.28] + - [505, 6012.28] - - [5056, 1408, 1, 128] - - [426, 4311.28] + - [498, 4311.28] - - [2048, 7000, 1, 2048] - - [437, 7232.07] + - [509, 7232.07] - - [256, 3584, 1, 3328] - - [441, 7006.0] + - [513, 7006.0] - - [4, 5888, 1, 3328] - - [459, 534.612] + - [531, 534.612] - - [128, 1408, 1, 128] - - [351, 1177.07] + - [423, 1177.07] - - [3584, 4288, 1, 3328] - - [443, 7135.0] + - [515, 7135.0] - - [5888, 1856, 1, 1280] - - [431, 8395.03] + - [503, 8395.03] - - [256, 1408, 1, 256] - - [432, 3977.46] + - [504, 3977.46] - - [5056, 64, 1, 1280] - - [432, 4257.78] + - [504, 4257.78] - - [1024, 704, 1, 256] - - [432, 5036.93] + - [504, 5036.93] - - [448, 128, 1, 128] - - [353, 533.533] + - [425, 533.533] - - [2368, 3584, 1, 1280] - - [437, 8272.43] + - [509, 8272.43] - - [2368, 6784, 1, 1280] - - [430, 8288.24] + - [502, 8288.24] - - [1856, 4, 1, 1280] - - [370, 464.1] + - [442, 464.1] - - [448, 448, 1, 256] - - [432, 3058.45] + - [504, 3058.45] - - [2944, 3584, 1, 3328] - - [437, 8557.63] + - [509, 8557.63] - - [7680, 32, 1, 2560] - - [409, 3729.03] + - [481, 3729.03] - - [128, 4288, 1, 128] - - [352, 2116.2] + - [424, 2116.2] - - [256, 256, 1, 3328] - - [408, 4051.06] + - [480, 4051.06] - - [128, 1024, 1, 3328] - - [381, 5139.21] + - [453, 5139.21] - - [4, 1408, 1, 3328] - - [400, 502.871] + - [472, 502.871] - - [6784, 2944, 1, 256] - - [431, 8446.06] + - [503, 8446.06] - - [64, 1856, 1, 1280] - - [373, 3870.86] + - [445, 3870.86] - - [6784, 64, 1, 128] - - [421, 1877.62] + - [493, 1877.62] - - [4288, 2368, 1, 3328] - - [441, 8419.4] + - [513, 8419.4] - - [1856, 2368, 1, 256] - - [435, 6887.48] + - [507, 6887.48] - - [3584, 256, 1, 128] - - [425, 2496.71] + - [497, 2496.71] - - [3584, 6784, 1, 3328] - - [437, 7626.18] + - [509, 7626.18] - - [256, 1024, 1, 256] - - [438, 3095.53] + - [510, 3095.53] - - [4, 6784, 1, 3328] - - [400, 589.274] + - [472, 589.274] - - [1024, 5888, 1, 3328] - - [437, 7794.35] + - [509, 7794.35] - - [1024, 128, 1, 1280] - - [410, 3130.18] + - [482, 3130.18] - - [3072, 32, 1, 1024] - - [397, 1675.59] + - [469, 1675.59] - - [6144, 24000, 1, 2560] - - [437, 7256.14] + - [509, 7256.14] - - [5056, 4288, 1, 1280] - - [435, 8349.03] + - [507, 8349.03] - - [5888, 64, 1, 256] - - [384, 2593.35] + - [456, 2593.35] - - [6784, 1856, 1, 3328] - - [431, 8087.38] + - [503, 8087.38] - - [1408, 5056, 1, 1280] - - [433, 7802.63] + - [505, 7802.63] - - [1856, 256, 1, 1280] - - [438, 6150.73] + - [510, 6150.73] - - [64, 5888, 1, 3328] - - [405, 5301.49] + - [477, 5301.49] - - [2368, 2368, 1, 1280] - - [435, 8233.43] + - [507, 8233.43] - - [2944, 5888, 1, 128] - - [428, 3745.51] + - [500, 3745.51] - - [704, 5888, 1, 1280] - - [433, 8245.04] + - [505, 8245.04] - - [2368, 3584, 1, 128] - - [425, 4523.43] + - [497, 4523.43] - - [1856, 5056, 1, 128] - - [422, 4498.08] + - [494, 4498.08] - - [704, 1024, 1, 1280] - - [446, 5479.59] + - [518, 5479.59] - - [448, 256, 1, 3328] - - [389, 5048.8] + - [461, 5048.8] - - [448, 1856, 1, 128] - - [422, 2936.92] + - [494, 2936.92] - - [8192, 3200, 1, 2048] - - [431, 6713.12] + - [503, 6713.12] - - [128, 1024, 1, 128] - - [367, 998.744] + - [439, 998.744] - - [2944, 4, 1, 128] - - [452, 98.7471] + - [524, 98.7471] - - [1024, 704, 1, 1280] - - [438, 5897.0] + - [510, 5897.0] - - [128, 5888, 1, 256] - - [438, 5014.08] + - [510, 5014.08] - - [1024, 5056, 1, 1280] - - [437, 8857.81] + - [509, 8857.81] - - [4288, 1024, 1, 256] - - [443, 6195.39] + - [515, 6195.39] - - [2944, 2368, 1, 128] - - [421, 4442.23] + - [493, 4442.23] - - [704, 704, 1, 3328] - - [438, 6764.4] + - [510, 6764.4] - - [704, 1408, 1, 1280] - - [439, 7383.58] + - [511, 7383.58] - - [5888, 448, 1, 1280] - - [437, 7299.49] + - [509, 7299.49] - - [3584, 256, 1, 3328] - - [435, 7061.72] + - [507, 7061.72] - - [704, 5888, 1, 3328] - - [439, 8142.42] + - [511, 8142.42] - - [704, 1856, 1, 128] - - [425, 3139.14] + - [497, 3139.14] - - [448, 448, 1, 3328] - - [403, 5063.34] + - [475, 5063.34] - - [4, 4288, 1, 128] - - [453, 64.9775] + - [525, 64.9775] - - [128, 704, 1, 1280] - - [373, 3400.55] + - [445, 3400.55] - - [3584, 2944, 1, 256] - - [443, 7982.14] + - [515, 7982.14] - - [3584, 4, 1, 128] - - [452, 105.318] + - [524, 105.318] - - [1856, 128, 1, 3328] - - [404, 5442.19] + - [476, 5442.19] - - [4, 64, 1, 1280] - - [458, 42.3268] + - [530, 42.3268] - - [2944, 448, 1, 128] - - [421, 2926.95] + - [493, 2926.95] - - [128, 2944, 1, 1280] - - [432, 5109.69] + - [504, 5109.69] - - [64, 64, 1, 3328] - - [400, 1252.99] + - [472, 1252.99] - - [448, 2944, 1, 1280] - - [441, 6684.47] + - [513, 6684.47] - - [512, 24000, 1, 2048] - - [437, 7939.03] + - [509, 7939.03] - - [128, 256, 1, 3328] - - [418, 3276.9] + - [490, 3276.9] - - [1408, 5056, 1, 3328] - - [443, 8959.21] + - [515, 8959.21] - - [1856, 1856, 1, 3328] - - [433, 8006.17] + - [505, 8006.17] - - [3584, 128, 1, 256] - - [438, 4292.52] + - [510, 4292.52] - - [2560, 800, 1, 2560] - - [433, 6262.48] + - [505, 6262.48] - - [448, 1408, 1, 3328] - - [449, 4997.35] + - [521, 4997.35] - - [2368, 2368, 1, 256] - - [451, 4978.94] + - [523, 4978.94] - - [4288, 4288, 1, 1280] - - [430, 8617.78] + - [502, 8617.78] - - [64, 448, 1, 1280] - - [376, 2057.28] + - [448, 2057.28] - - [5888, 1024, 1, 1280] - - [448, 6848.17] + - [520, 6848.17] - - [1408, 4288, 1, 256] - - [431, 7077.01] + - [503, 7077.01] - - [448, 4, 1, 256] - - [456, 84.4294] + - [528, 84.4294] - - [5888, 448, 1, 128] - - [425, 3493.91] + - [497, 3493.91] - - [512, 48000, 1, 2560] - - [443, 8960.13] + - [515, 8960.13] - - [35, 8457, 1, 1760] - - [345, 3934.78] + - [417, 3934.78] - - [704, 6784, 1, 3328] - - [430, 8180.88] + - [502, 8180.88] - - [2560, 6400, 1, 2560] - - [431, 7822.24] + - [503, 7822.24] - - [5056, 1024, 1, 1280] - - [433, 8357.38] + - [505, 8357.38] - - [448, 5888, 1, 3328] - - [437, 7505.28] + - [509, 7505.28] - - [128, 4, 1, 128] - - [452, 0.662251] + - [524, 0.662251] - - [1024, 2944, 1, 1280] - - [437, 8406.24] + - [509, 8406.24] - - [5056, 5888, 1, 1280] - - [437, 8819.76] + - [509, 8819.76] - - [4288, 5888, 1, 128] - - [422, 3522.32] + - [494, 3522.32] - - [256, 3584, 1, 256] - - [433, 5883.89] + - [505, 5883.89] - - [1408, 3584, 1, 128] - - [421, 4283.41] + - [493, 4283.41] - - [256, 2944, 1, 3328] - - [441, 5670.63] + - [513, 5670.63] - - [448, 3584, 1, 128] - - [425, 3171.72] + - [497, 3171.72] - - [5888, 2944, 1, 1280] - - [443, 8198.86] + - [515, 8198.86] - - [4, 6784, 1, 1280] - - [390, 553.896] + - [462, 553.896] - - [2368, 5888, 1, 128] - - [421, 4787.32] + - [493, 4787.32] - - [8448, 16, 1, 2816] - - [380, 2452.63] + - [452, 2452.63] - - [64, 2944, 1, 128] - - [353, 1376.66] + - [425, 1376.66] - - [2368, 4, 1, 256] - - [375, 278.177] + - [447, 278.177] - - [3584, 5888, 1, 256] - - [451, 6233.66] + - [523, 6233.66] - - [2368, 1024, 1, 128] - - [422, 3781.51] + - [494, 3781.51] - - [2368, 704, 1, 128] - - [422, 3198.32] + - [494, 3198.32] - - [3584, 2944, 1, 1280] - - [433, 8045.68] + - [505, 8045.68] - - [3584, 2368, 1, 128] - - [422, 4188.57] + - [494, 4188.57] - - [5056, 704, 1, 128] - - [425, 4019.21] + - [497, 4019.21] - - [448, 2368, 1, 128] - - [427, 2522.21] + - [499, 2522.21] - - [5056, 1408, 1, 3328] - - [435, 8349.93] + - [507, 8349.93] - - [1408, 704, 1, 256] - - [441, 4741.42] + - [513, 4741.42] - - [6784, 1024, 1, 3328] - - [443, 8769.5] + - [515, 8769.5] - - [6784, 2944, 1, 3328] - - [440, 7319.74] + - [512, 7319.74] - - [2944, 5056, 1, 3328] - - [430, 8889.76] + - [502, 8889.76] - - [1856, 1856, 1, 256] - - [433, 6309.84] + - [505, 6309.84] - - [1024, 5888, 1, 128] - - [424, 3759.6] + - [496, 3759.6] - - [6784, 2368, 1, 1280] - - [433, 8298.4] + - [505, 8298.4] - - [256, 4, 1, 128] - - [452, 7.10171] + - [524, 7.10171] - - [4288, 5888, 1, 1280] - - [437, 8365.28] + - [509, 8365.28] - - [4288, 4288, 1, 256] - - [437, 6513.78] + - [509, 6513.78] - - [8448, 32, 1, 2816] - - [408, 4257.74] + - [480, 4257.74] - - [448, 2944, 1, 3328] - - [441, 6875.62] + - [513, 6875.62] - - [5888, 4, 1, 128] - - [452, 163.94] + - [524, 163.94] - - [4288, 1856, 1, 1280] - - [437, 8402.91] + - [509, 8402.91] - - [1856, 2944, 1, 3328] - - [437, 6612.21] + - [509, 6612.21] - - [256, 6784, 1, 3328] - - [438, 7358.7] + - [510, 7358.7] - - [64, 5888, 1, 256] - - [432, 3359.05] + - [504, 3359.05] - - [256, 5056, 1, 128] - - [425, 2489.21] + - [497, 2489.21] - - [5056, 1024, 1, 256] - - [443, 8077.87] + - [515, 8077.87] - - [704, 64, 1, 3328] - - [387, 3288.4] + - [459, 3288.4] - - [5056, 1856, 1, 3328] - - [441, 8171.13] + - [513, 8171.13] - - [4, 2944, 1, 3328] - - [400, 546.843] + - [472, 546.843] - - [4, 5056, 1, 256] - - [375, 378.561] + - [447, 378.561] - - [1856, 1408, 1, 256] - - [443, 6320.88] + - [515, 6320.88] - - [8448, 12000, 1, 2816] - - [441, 7365.87] + - [513, 7365.87] - - [6784, 128, 1, 3328] - - [438, 6366.57] + - [510, 6366.57] - - [4288, 1408, 1, 128] - - [421, 4451.7] + - [493, 4451.7] - - [1856, 5888, 1, 3328] - - [439, 8619.76] + - [511, 8619.76] - - [4288, 5056, 1, 256] - - [443, 7289.05] + - [515, 7289.05] - - [1408, 128, 1, 1280] - - [381, 4291.15] + - [453, 4291.15] - - [4096, 800, 1, 1024] - - [432, 5867.89] + - [504, 5867.89] - - [5056, 256, 1, 3328] - - [438, 7527.61] + - [510, 7527.61] - - [704, 704, 1, 256] - - [438, 4417.85] + - [510, 4417.85] - - [1024, 5888, 1, 1280] - - [443, 8674.57] + - [515, 8674.57] - - [6784, 2368, 1, 128] - - [421, 4724.08] + - [493, 4724.08] - - [4, 5056, 1, 1280] - - [390, 540.307] + - [462, 540.307] - - [256, 64, 1, 1280] - - [392, 1515.38] + - [464, 1515.38] - - [128, 1856, 1, 1280] - - [432, 4574.21] + - [504, 4574.21] - - [1856, 1024, 1, 1280] - - [437, 7741.61] + - [509, 7741.61] - - [6784, 4288, 1, 1280] - - [443, 8521.29] + - [515, 8521.29] - - [2560, 64, 1, 2560] - - [374, 3504.7] + - [446, 3504.7] - - [1856, 1856, 1, 1280] - - [433, 7779.31] + - [505, 7779.31] - - [4096, 400, 1, 1024] - - [443, 4157.81] + - [515, 4157.81] - - [3072, 24000, 1, 1024] - - [443, 8663.45] + - [515, 8663.45] - - [128, 4288, 1, 3328] - - [389, 5674.23] + - [461, 5674.23] - - [4, 2368, 1, 3328] - - [400, 525.48] + - [472, 525.48] - - [5888, 1856, 1, 128] - - [425, 4099.74] + - [497, 4099.74] - - [448, 704, 1, 1280] - - [438, 4309.47] + - [510, 4309.47] - - [128, 5056, 1, 1280] - - [381, 5068.46] + - [453, 5068.46] - - [1024, 448, 1, 3328] - - [441, 6077.82] + - [513, 6077.82] - - [1856, 704, 1, 1280] - - [449, 6257.49] + - [521, 6257.49] - - [5056, 3584, 1, 128] - - [422, 4598.52] + - [494, 4598.52] - - [5888, 5888, 1, 3328] - - [443, 8058.25] + - [515, 8058.25] - - [6784, 1024, 1, 256] - - [443, 5120.99] + - [515, 5120.99] - - [2944, 2368, 1, 256] - - [434, 6523.03] + - [506, 6523.03] - - [256, 448, 1, 256] - - [384, 1816.94] + - [456, 1816.94] - - [5056, 5888, 1, 3328] - - [436, 6722.41] + - [508, 6722.41] - - [1856, 1024, 1, 256] - - [443, 6632.31] + - [515, 6632.31] - - [512, 48000, 1, 1536] - - [437, 8556.01] + - [509, 8556.01] - - [3584, 448, 1, 1280] - - [432, 6567.09] + - [504, 6567.09] - - [8448, 5984, 1, 2816] - - [437, 8990.66] + - [509, 8990.66] - - [448, 5888, 1, 256] - - [437, 6220.47] + - [509, 6220.47] - - [704, 64, 1, 128] - - [350, 450.66] + - [422, 450.66] - - [1408, 6784, 1, 3328] - - [430, 8478.68] + - [502, 8478.68] - - [448, 1024, 1, 128] - - [429, 1844.33] + - [501, 1844.33] - - [4288, 704, 1, 128] - - [425, 3895.26] + - [497, 3895.26] - - [128, 1856, 1, 128] - - [356, 1456.46] + - [428, 1456.46] - - [448, 2368, 1, 3328] - - [435, 5538.04] + - [507, 5538.04] - - [5056, 64, 1, 128] - - [421, 1648.94] + - [493, 1648.94] - - [5056, 2944, 1, 256] - - [437, 8230.87] + - [509, 8230.87] - - [6784, 5888, 1, 128] - - [421, 4873.19] + - [493, 4873.19] - - [1024, 700, 1, 512] - - [435, 4445.37] + - [507, 4445.37] - - [704, 1024, 1, 256] - - [433, 4707.99] + - [505, 4707.99] - - [1024, 4, 1, 256] - - [375, 174.863] + - [447, 174.863] - - [2944, 704, 1, 128] - - [425, 3483.42] + - [497, 3483.42] - - [128, 6784, 1, 1280] - - [433, 6522.93] + - [505, 6522.93] - - [1408, 3584, 1, 3328] - - [437, 8673.59] + - [509, 8673.59] - - [2368, 6784, 1, 256] - - [433, 7941.76] + - [505, 7941.76] - - [5056, 1408, 1, 1280] - - [437, 8801.01] + - [509, 8801.01] - - [256, 256, 1, 128] - - [362, 551.982] + - [434, 551.982] - - [5056, 4288, 1, 128] - - [429, 3793.64] + - [501, 3793.64] - - [1408, 1856, 1, 128] - - [421, 3067.74] + - [493, 3067.74] - - [1408, 5888, 1, 3328] - - [437, 9148.97] + - [509, 9148.97] - - [1856, 256, 1, 256] - - [433, 4319.52] + - [505, 4319.52] - - [6784, 6784, 1, 256] - - [433, 7668.53] + - [505, 7668.53] - - [64, 256, 1, 128] - - [367, 131.172] + - [439, 131.172] - - [4288, 2368, 1, 128] - - [422, 4582.99] + - [494, 4582.99] - - [256, 4288, 1, 1280] - - [432, 6058.61] + - [504, 6058.61] - - [2368, 2944, 1, 256] - - [437, 8016.07] + - [509, 8016.07] - - [4, 1856, 1, 256] - - [454, 252.832] + - [526, 252.832] - - [3584, 1856, 1, 1280] - - [433, 7760.24] + - [505, 7760.24] - - [6784, 6784, 1, 128] - - [422, 4970.14] + - [494, 4970.14] - - [256, 1856, 1, 128] - - [428, 1580.59] + - [500, 1580.59] - - [704, 64, 1, 1280] - - [417, 2556.47] + - [489, 2556.47] - - [5888, 5056, 1, 256] - - [437, 8216.67] + - [509, 8216.67] - - [8448, 48000, 1, 2816] - - [443, 4082.89] + - [515, 4082.89] - - [3584, 448, 1, 256] - - [437, 5518.92] + - [509, 5518.92] - - [448, 4288, 1, 128] - - [425, 3415.25] + - [497, 3415.25] - - [7680, 64, 1, 2560] - - [386, 5162.1] + - [458, 5162.1] - - [256, 6784, 1, 256] - - [437, 6272.62] + - [509, 6272.62] - - [1408, 4288, 1, 128] - - [425, 4343.63] + - [497, 4343.63] - - [2944, 704, 1, 3328] - - [432, 7679.71] + - [504, 7679.71] - - [128, 448, 1, 256] - - [372, 1422.59] + - [444, 1422.59] - - [5056, 256, 1, 1280] - - [439, 5052.39] + - [511, 5052.39] - - [2560, 32, 1, 2560] - - [395, 3106.07] + - [467, 3106.07] - - [3584, 3584, 1, 256] - - [443, 8260.57] + - [515, 8260.57] - - [448, 1408, 1, 128] - - [421, 2397.38] + - [493, 2397.38] - - [128, 256, 1, 1280] - - [376, 2340.67] + - [448, 2340.67] - - [3584, 5056, 1, 256] - - [443, 7347.56] + - [515, 7347.56] - - [6784, 128, 1, 256] - - [433, 5591.1] + - [505, 5591.1] - - [4288, 4, 1, 256] - - [375, 354.206] + - [447, 354.206] - - [704, 448, 1, 256] - - [438, 3492.33] + - [510, 3492.33] - - [2944, 2368, 1, 1280] - - [445, 6661.71] + - [517, 6661.71] - - [448, 64, 1, 3328] - - [417, 3058.45] + - [489, 3058.45] - - [1408, 3584, 1, 256] - - [443, 7966.59] + - [515, 7966.59] - - [3584, 4, 1, 3328] - - [456, 605.559] + - [528, 605.559] - - [6784, 3584, 1, 256] - - [433, 7525.41] + - [505, 7525.41] - - [256, 128, 1, 128] - - [365, 276.041] + - [437, 276.041] - - [704, 1408, 1, 128] - - [422, 3109.85] + - [494, 3109.85] - - [4, 2368, 1, 256] - - [456, 283.375] + - [528, 283.375] - - [4288, 128, 1, 1280] - - [438, 5132.65] + - [510, 5132.65] - - [128, 1408, 1, 256] - - [432, 2733.35] + - [504, 2733.35] - - [4, 2944, 1, 256] - - [454, 314.127] + - [526, 314.127] - - [64, 128, 1, 3328] - - [402, 1514.71] + - [474, 1514.71] - - [5056, 2368, 1, 128] - - [426, 3449.17] + - [498, 3449.17] - - [2944, 2944, 1, 3328] - - [430, 8169.03] + - [502, 8169.03] - - [5056, 6784, 1, 256] - - [450, 5792.77] + - [522, 5792.77] - - [1856, 3584, 1, 128] - - [427, 4213.5] + - [499, 4213.5] - - [128, 2944, 1, 128] - - [351, 1970.46] + - [423, 1970.46] - - [35, 8457, 1, 2560] - - [346, 3525.15] + - [418, 3525.15] - - [1024, 704, 1, 3328] - - [432, 6784.99] + - [504, 6784.99] - - [6784, 448, 1, 256] - - [441, 6544.88] + - [513, 6544.88] - - [3584, 6784, 1, 128] - - [421, 4623.6] + - [493, 4623.6] - - [128, 4288, 1, 256] - - [435, 3606.6] + - [507, 3606.6] - - [704, 448, 1, 3328] - - [432, 4478.01] + - [504, 4478.01] - - [128, 128, 1, 3328] - - [417, 2177.65] + - [489, 2177.65] - - [5056, 1856, 1, 256] - - [451, 5608.72] + - [523, 5608.72] - - [4608, 5984, 1, 1536] - - [440, 7859.85] + - [512, 7859.85] - - [256, 128, 1, 256] - - [376, 998.744] + - [448, 998.744] - - [1760, 3200, 1, 1760] - - [433, 8179.64] + - [505, 8179.64] - - [1024, 1856, 1, 256] - - [443, 6143.27] + - [515, 6143.27] - - [4096, 1600, 1, 1024] - - [451, 5851.52] + - [523, 5851.52] - - [4288, 64, 1, 128] - - [356, 1372.26] + - [428, 1372.26] - - [256, 448, 1, 3328] - - [395, 4795.1] + - [467, 4795.1] - - [1408, 6784, 1, 1280] - - [437, 8426.5] + - [509, 8426.5] - - [3584, 3584, 1, 1280] - - [437, 7556.56] + - [509, 7556.56] - - [7680, 24000, 1, 2560] - - [430, 5019.19] + - [502, 5019.19] - - [64, 2368, 1, 1280] - - [381, 4061.8] + - [453, 4061.8] - - [448, 2368, 1, 1280] - - [432, 5928.77] + - [504, 5928.77] - - [4608, 48000, 1, 1536] - - [437, 6937.4] + - [509, 6937.4] - - [5888, 5888, 1, 128] - - [422, 3744.0] + - [494, 3744.0] - - [64, 6784, 1, 3328] - - [432, 5988.72] + - [504, 5988.72] - - [2944, 256, 1, 1280] - - [438, 6717.97] + - [510, 6717.97] - - [2048, 16, 1, 2048] - - [390, 1210.58] + - [462, 1210.58] - - [256, 2368, 1, 128] - - [425, 1936.07] + - [497, 1936.07] - - [5056, 2368, 1, 3328] - - [443, 8875.63] + - [515, 8875.63] - - [2944, 4288, 1, 256] - - [437, 8063.24] + - [509, 8063.24] - - [1408, 3584, 1, 1280] - - [433, 8197.07] + - [505, 8197.07] - - [2368, 64, 1, 256] - - [432, 2365.79] + - [504, 2365.79] - - [64, 448, 1, 3328] - - [418, 3027.4] + - [490, 3027.4] - - [704, 128, 1, 3328] - - [389, 4452.19] + - [461, 4452.19] - - [8192, 1600, 1, 2048] - - [437, 7229.93] + - [509, 7229.93] - - [1856, 704, 1, 256] - - [439, 5545.45] + - [511, 5545.45] - - [4, 4288, 1, 1280] - - [390, 523.825] + - [462, 523.825] - - [1408, 448, 1, 3328] - - [444, 4789.4] + - [516, 4789.4] - - [1024, 4, 1, 3328] - - [370, 504.223] + - [442, 504.223] - - [512, 24000, 1, 2560] - - [443, 8903.62] + - [515, 8903.62] - - [2368, 6784, 1, 3328] - - [443, 8311.14] + - [515, 8311.14] - - [1856, 1408, 1, 1280] - - [433, 8160.11] + - [505, 8160.11] - - [1856, 448, 1, 1280] - - [435, 6243.07] + - [507, 6243.07] - - [6784, 704, 1, 128] - - [421, 4069.05] + - [493, 4069.05] - - [4, 4, 1, 256] - - [390, 0.842029] + - [462, 0.842029] - - [128, 5888, 1, 128] - - [421, 2328.02] + - [493, 2328.02] - - [1408, 5888, 1, 256] - - [432, 6986.91] + - [504, 6986.91] - - [704, 2944, 1, 1280] - - [433, 7905.03] + - [505, 7905.03] - - [4288, 64, 1, 1280] - - [408, 3828.27] + - [480, 3828.27] - - [256, 64, 1, 256] - - [383, 655.46] + - [455, 655.46] - - [704, 1856, 1, 256] - - [441, 5444.37] + - [513, 5444.37] - - [704, 6784, 1, 128] - - [421, 4319.77] + - [493, 4319.77] - - [3584, 704, 1, 1280] - - [441, 7726.43] + - [513, 7726.43] - - [256, 128, 1, 1280] - - [376, 2184.63] + - [448, 2184.63] - - [5888, 2368, 1, 256] - - [443, 8192.69] + - [515, 8192.69] - - [256, 2368, 1, 1280] - - [438, 5675.54] + - [510, 5675.54] - - [2944, 6784, 1, 128] - - [426, 4248.35] + - [498, 4248.35] - - [3584, 448, 1, 3328] - - [437, 6560.77] + - [509, 6560.77] - - [1408, 4, 1, 256] - - [455, 176.79] + - [527, 176.79] - - [704, 2368, 1, 3328] - - [438, 7085.31] + - [510, 7085.31] - - [2944, 448, 1, 256] - - [434, 3412.0] + - [506, 3412.0] - - [1856, 448, 1, 128] - - [422, 2748.82] + - [494, 2748.82] - - [4288, 4, 1, 3328] - - [390, 553.648] + - [462, 553.648] - - [2368, 128, 1, 1280] - - [411, 4173.65] + - [483, 4173.65] - - [256, 5888, 1, 128] - - [426, 2860.98] + - [498, 2860.98] - - [64, 6784, 1, 256] - - [439, 3637.18] + - [511, 3637.18] - - [64, 5056, 1, 1280] - - [438, 4289.53] + - [510, 4289.53] - - [4, 6784, 1, 128] - - [452, 160.906] + - [524, 160.906] - - [2048, 3200, 1, 512] - - [439, 6927.09] + - [511, 6927.09] - - [2944, 2944, 1, 1280] - - [431, 6267.85] + - [503, 6267.85] - - [5056, 448, 1, 3328] - - [432, 7400.36] + - [504, 7400.36] - - [4, 3584, 1, 1280] - - [390, 499.83] + - [462, 499.83] - - [1408, 128, 1, 128] - - [367, 1037.36] + - [439, 1037.36] - - [6784, 704, 1, 3328] - - [438, 7633.95] + - [510, 7633.95] - - [128, 64, 1, 1280] - - [390, 1170.39] + - [462, 1170.39] - - [2368, 256, 1, 1280] - - [438, 5609.89] + - [510, 5609.89] - - [4, 448, 1, 3328] - - [458, 358.5] + - [530, 358.5] - - [5888, 4288, 1, 128] - - [426, 4521.74] + - [498, 4521.74] - - [4, 5888, 1, 256] - - [390, 353.933] + - [462, 353.933] - - [1408, 2944, 1, 3328] - - [431, 8951.41] + - [503, 8951.41] - - [3584, 704, 1, 128] - - [421, 3395.41] + - [493, 3395.41] - - [4608, 12000, 1, 1536] - - [430, 6609.99] + - [502, 6609.99] - - [64, 1024, 1, 256] - - [376, 1588.85] + - [448, 1588.85] - - [5056, 5056, 1, 128] - - [421, 4080.81] + - [493, 4080.81] - - [2368, 448, 1, 1280] - - [432, 5423.04] + - [504, 5423.04] - - [128, 3584, 1, 256] - - [438, 4705.25] + - [510, 4705.25] - - [704, 448, 1, 1280] - - [435, 3961.07] + - [507, 3961.07] - - [8192, 800, 1, 2048] - - [433, 6306.36] + - [505, 6306.36] - - [448, 5056, 1, 128] - - [425, 3709.56] + - [497, 3709.56] - - [256, 4, 1, 1280] - - [457, 163.94] + - [529, 163.94] - - [5056, 3584, 1, 256] - - [430, 7008.34] + - [502, 7008.34] - - [2368, 4, 1, 3328] - - [390, 496.366] + - [462, 496.366] - - [1408, 5056, 1, 128] - - [425, 4175.37] + - [497, 4175.37] - - [2944, 3584, 1, 128] - - [421, 4659.79] + - [493, 4659.79] - - [3584, 2368, 1, 256] - - [443, 5851.87] + - [515, 5851.87] - - [128, 3584, 1, 3328] - - [433, 6105.04] + - [505, 6105.04] - - [128, 1024, 1, 1280] - - [373, 3848.09] + - [445, 3848.09] - - [8448, 24000, 1, 2816] - - [443, 5128.64] + - [515, 5128.64] - - [64, 704, 1, 256] - - [376, 1253.83] + - [448, 1253.83] - - [4288, 256, 1, 1280] - - [432, 5625.86] + - [504, 5625.86] - - [3584, 3584, 1, 3328] - - [437, 8206.15] + - [509, 8206.15] - - [4, 704, 1, 128] - - [452, 29.5484] + - [524, 29.5484] - - [5888, 6784, 1, 256] - - [439, 8248.75] + - [511, 8248.75] - - [4288, 2944, 1, 3328] - - [437, 8657.12] + - [509, 8657.12] - - [2944, 64, 1, 128] - - [356, 1240.7] + - [428, 1240.7] - - [1024, 128, 1, 3328] - - [381, 4433.1] + - [453, 4433.1] - - [1024, 16, 1, 500000] - - [344, 2571.15] + - [416, 2571.15] - - [4288, 128, 1, 3328] - - [381, 5716.85] + - [453, 5716.85] - - [7680, 128, 1, 2560] - - [379, 5488.1] + - [451, 5488.1] - - [256, 5056, 1, 1280] - - [439, 6380.06] + - [511, 6380.06] - - [1408, 256, 1, 128] - - [425, 1633.83] + - [497, 1633.83] - - [2944, 5888, 1, 3328] - - [434, 7849.02] + - [506, 7849.02] - - [6784, 5888, 1, 1280] - - [443, 9047.72] + - [515, 9047.72] - - [2048, 800, 1, 512] - - [438, 4841.17] + - [510, 4841.17] - - [704, 128, 1, 256] - - [383, 1567.27] + - [455, 1567.27] - - [5888, 4288, 1, 1280] - - [437, 7982.93] + - [509, 7982.93] - - [1024, 24000, 1, 2048] - - [439, 5774.4] + - [511, 5774.4] - - [448, 256, 1, 1280] - - [373, 3707.19] + - [445, 3707.19] - - [5888, 3584, 1, 128] - - [426, 3804.5] + - [498, 3804.5] - - [1024, 2944, 1, 128] - - [421, 3308.36] + - [493, 3308.36] - - [5056, 4, 1, 1280] - - [454, 469.062] + - [526, 469.062] - - [256, 1408, 1, 1280] - - [432, 4899.99] + - [504, 4899.99] - - [3072, 16, 1, 1024] - - [390, 1233.72] + - [462, 1233.72] - - [704, 3584, 1, 128] - - [421, 3919.53] + - [493, 3919.53] - - [5888, 448, 1, 3328] - - [451, 6095.71] + - [523, 6095.71] - - [2368, 4288, 1, 1280] - - [433, 8338.4] + - [505, 8338.4] - - [4288, 2944, 1, 128] - - [425, 3946.6] + - [497, 3946.6] - - [1024, 6784, 1, 3328] - - [439, 7494.38] + - [511, 7494.38] - - [128, 2368, 1, 256] - - [438, 2895.42] + - [510, 2895.42] - - [6784, 64, 1, 3328] - - [432, 5964.99] + - [504, 5964.99] - - [5056, 2944, 1, 3328] - - [443, 6605.63] + - [515, 6605.63] - - [448, 128, 1, 256] - - [376, 1339.52] + - [448, 1339.52] - - [2944, 3584, 1, 256] - - [439, 7165.66] + - [511, 7165.66] - - [1408, 1408, 1, 3328] - - [443, 8332.96] + - [515, 8332.96] - - [1856, 128, 1, 1280] - - [438, 4498.43] + - [510, 4498.43] - - [3584, 3584, 1, 128] - - [422, 4000.11] + - [494, 4000.11] - - [64, 3584, 1, 256] - - [449, 2383.23] + - [521, 2383.23] - - [1408, 4, 1, 3328] - - [400, 423.008] + - [472, 423.008] - - [128, 2944, 1, 3328] - - [405, 5430.03] + - [477, 5430.03] - - [3584, 704, 1, 256] - - [438, 6154.09] + - [510, 6154.09] - - [2944, 448, 1, 3328] - - [438, 6507.82] + - [510, 6507.82] - - [3584, 1408, 1, 3328] - - [443, 8829.73] + - [515, 8829.73] - - [704, 3584, 1, 1280] - - [433, 7860.33] + - [505, 7860.33] - - [2944, 6784, 1, 1280] - - [443, 8894.6] + - [515, 8894.6] - - [1856, 6784, 1, 256] - - [443, 8115.19] + - [515, 8115.19] - - [4288, 448, 1, 3328] - - [435, 6397.35] + - [507, 6397.35] - - [6784, 4288, 1, 128] - - [421, 4109.54] + - [493, 4109.54] - - [6784, 704, 1, 1280] - - [431, 7999.14] + - [503, 7999.14] - - [256, 4288, 1, 256] - - [435, 4603.94] + - [507, 4603.94] - - [3584, 6784, 1, 256] - - [443, 7361.65] + - [515, 7361.65] - - [6144, 12000, 1, 2048] - - [442, 6311.76] + - [514, 6311.76] - - [6144, 16, 1, 2560] - - [391, 2240.65] + - [463, 2240.65] - - [3584, 64, 1, 128] - - [362, 1292.36] + - [434, 1292.36] - - [5888, 1024, 1, 3328] - - [430, 8394.59] + - [502, 8394.59] - - [448, 64, 1, 128] - - [353, 262.244] + - [425, 262.244] - - [704, 6784, 1, 1280] - - [437, 7740.66] + - [509, 7740.66] - - [4, 1024, 1, 1280] - - [390, 378.921] + - [462, 378.921] - - [5888, 128, 1, 256] - - [438, 5003.68] + - [510, 5003.68] - - [4096, 16, 1, 4096] - - [390, 1585.85] + - [462, 1585.85] - - [1856, 5056, 1, 3328] - - [431, 8522.92] + - [503, 8522.92] - - [4, 6784, 1, 256] - - [375, 387.757] + - [447, 387.757] - - [1024, 3584, 1, 128] - - [425, 3031.61] + - [497, 3031.61] - - [1024, 1408, 1, 128] - - [427, 2600.85] + - [499, 2600.85] - - [2368, 2944, 1, 128] - - [424, 4340.26] + - [496, 4340.26] - - [5056, 64, 1, 256] - - [438, 3109.62] + - [510, 3109.62] - - [4, 448, 1, 1280] - - [458, 253.835] + - [530, 253.835] - - [5056, 2944, 1, 128] - - [429, 3740.01] + - [501, 3740.01] - - [5888, 5056, 1, 3328] - - [443, 9016.48] + - [515, 9016.48] - - [1024, 704, 1, 128] - - [425, 2363.66] + - [497, 2363.66] - - [5888, 2368, 1, 128] - - [428, 3651.83] + - [500, 3651.83] - - [128, 5056, 1, 3328] - - [432, 6243.64] + - [504, 6243.64] - - [3584, 6784, 1, 1280] - - [430, 9080.67] + - [502, 9080.67] - - [448, 4, 1, 1280] - - [458, 243.083] + - [530, 243.083] - - [1856, 5888, 1, 256] - - [443, 8182.12] + - [515, 8182.12] - - [256, 256, 1, 256] - - [376, 1542.12] + - [448, 1542.12] - - [256, 64, 1, 128] - - [357, 135.226] + - [429, 135.226] - - [4288, 4288, 1, 3328] - - [443, 8674.64] + - [515, 8674.64] - - [4288, 1408, 1, 1280] - - [431, 7867.18] + - [503, 7867.18] - - [3584, 5056, 1, 128] - - [421, 4457.83] + - [493, 4457.83] - - [4, 1024, 1, 3328] - - [370, 440.394] + - [442, 440.394] - - [4288, 2368, 1, 256] - - [451, 5699.57] + - [523, 5699.57] - - [2944, 5056, 1, 1280] - - [443, 8236.56] + - [515, 8236.56] - - [448, 6784, 1, 256] - - [433, 6620.62] + - [505, 6620.62] - - [64, 128, 1, 128] - - [358, 67.6629] + - [430, 67.6629] - - [1856, 2368, 1, 128] - - [425, 4233.7] + - [497, 4233.7] - - [6784, 2368, 1, 3328] - - [443, 8269.9] + - [515, 8269.9] - - [256, 1024, 1, 1280] - - [432, 4882.88] + - [504, 4882.88] - - [704, 4, 1, 128] - - [452, 19.111] + - [524, 19.111] - - [256, 4, 1, 256] - - [390, 46.9114] + - [462, 46.9114] - - [4288, 128, 1, 256] - - [438, 4273.49] + - [510, 4273.49] - - [4288, 1856, 1, 3328] - - [433, 8195.81] + - [505, 8195.81] - - [3584, 448, 1, 128] - - [426, 2750.65] + - [498, 2750.65] - - [2048, 1600, 1, 2048] - - [449, 5753.59] + - [521, 5753.59] - - [256, 4, 1, 3328] - - [459, 297.978] + - [531, 297.978] - - [4, 1408, 1, 1280] - - [457, 402.386] + - [529, 402.386] - - [3584, 64, 1, 1280] - - [446, 4096.1] + - [518, 4096.1] - - [1408, 448, 1, 128] - - [421, 2498.25] + - [493, 2498.25] - - [3584, 1024, 1, 1280] - - [443, 7252.18] + - [515, 7252.18] - - [1856, 5056, 1, 256] - - [437, 7711.59] + - [509, 7711.59] - - [4, 3584, 1, 256] - - [454, 314.314] + - [526, 314.314] - - [4, 2944, 1, 1280] - - [390, 483.218] + - [462, 483.218] - - [1024, 4288, 1, 256] - - [442, 6544.52] + - [514, 6544.52] - - [5888, 3584, 1, 3328] - - [431, 8105.15] + - [503, 8105.15] - - [1856, 4, 1, 256] - - [390, 252.832] + - [462, 252.832] - - [4, 256, 1, 256] - - [375, 48.2882] + - [447, 48.2882] - - [5056, 3584, 1, 3328] - - [436, 7354.8] + - [508, 7354.8] - - [704, 448, 1, 128] - - [429, 1233.91] + - [501, 1233.91] - - [2368, 1408, 1, 1280] - - [437, 6654.24] + - [509, 6654.24] - - [5056, 2944, 1, 1280] - - [443, 8505.72] + - [515, 8505.72] - - [4, 4, 1, 128] - - [453, 0.1478505] + - [525, 0.1478505] - - [3584, 256, 1, 256] - - [435, 4616.47] + - [507, 4616.47] - - [1024, 6784, 1, 256] - - [437, 7944.98] + - [509, 7944.98] - - [4, 128, 1, 256] - - [390, 29.3571] + - [462, 29.3571] - - [64, 64, 1, 1280] - - [401, 642.61] + - [473, 642.61] - - [5124, 9124, 1, 2048] - - [443, 8019.4] + - [515, 8019.4] - - [6784, 4, 1, 128] - - [452, 193.067] + - [524, 193.067] - - [2944, 1408, 1, 128] - - [421, 3827.13] + - [493, 3827.13] - - [448, 128, 1, 3328] - - [394, 4064.0] + - [466, 4064.0] - - [3584, 1408, 1, 1280] - - [443, 7180.83] + - [515, 7180.83] - - [64, 4288, 1, 3328] - - [389, 4786.84] + - [461, 4786.84] - - [5056, 6784, 1, 3328] - - [430, 7889.83] + - [502, 7889.83] - - [128, 2944, 1, 256] - - [433, 3599.69] + - [505, 3599.69] - - [128, 6784, 1, 128] - - [351, 2606.79] + - [423, 2606.79] - - [3584, 4288, 1, 256] - - [437, 7299.81] + - [509, 7299.81] - - [448, 1856, 1, 256] - - [433, 5207.07] + - [505, 5207.07] - - [1856, 6784, 1, 3328] - - [435, 8386.36] + - [507, 8386.36] - - [3584, 128, 1, 3328] - - [379, 5590.04] + - [451, 5590.04] - - [64, 1856, 1, 256] - - [372, 1949.38] + - [444, 1949.38] - - [64, 448, 1, 256] - - [377, 955.833] + - [449, 955.833] - - [5888, 4288, 1, 256] - - [441, 7791.84] + - [513, 7791.84] - - [4, 448, 1, 128] - - [452, 8.84146] + - [524, 8.84146] - - [5056, 1408, 1, 256] - - [443, 5154.01] + - [515, 5154.01] - - [35, 8457, 1, 2048] - - [348, 3182.57] + - [420, 3182.57] - - [64, 256, 1, 1280] - - [397, 1713.46] + - [469, 1713.46] - - [3584, 1024, 1, 256] - - [433, 6528.18] + - [505, 6528.18] - - [256, 704, 1, 256] - - [432, 2720.46] + - [504, 2720.46] - - [5888, 5888, 1, 256] - - [441, 7992.26] + - [513, 7992.26] - - [4288, 1024, 1, 1280] - - [435, 7837.5] + - [507, 7837.5] - - [5888, 128, 1, 3328] - - [438, 7181.13] + - [510, 7181.13] - - [448, 6784, 1, 3328] - - [432, 7663.1] + - [504, 7663.1] - - [2944, 1408, 1, 1280] - - [441, 7903.14] + - [513, 7903.14] - - [64, 128, 1, 1280] - - [390, 1191.66] + - [462, 1191.66] - - [2944, 1856, 1, 3328] - - [431, 7844.41] + - [503, 7844.41] - - [2368, 64, 1, 128] - - [362, 997.973] + - [434, 997.973] - - [256, 1024, 1, 128] - - [421, 1215.84] + - [493, 1215.84] - - [3584, 5888, 1, 1280] - - [430, 8958.94] + - [502, 8958.94] - - [64, 4, 1, 128] - - [453, 1.21608] + - [525, 1.21608] - - [6784, 1856, 1, 1280] - - [430, 6728.8] + - [502, 6728.8] - - [2944, 5056, 1, 256] - - [443, 8275.21] + - [515, 8275.21] - - [4288, 4, 1, 128] - - [452, 147.644] + - [524, 147.644] - - [5888, 256, 1, 3328] - - [439, 7094.2] + - [511, 7094.2] - - [2944, 4288, 1, 128] - - [424, 4611.55] + - [496, 4611.55] - - [3584, 1408, 1, 256] - - [434, 6543.06] + - [506, 6543.06] - - [704, 3584, 1, 3328] - - [433, 8117.2] + - [505, 8117.2] - - [4096, 3200, 1, 1024] - - [448, 6656.13] + - [520, 6656.13] - - [5056, 448, 1, 1280] - - [446, 6096.2] + - [518, 6096.2] - - [3584, 1856, 1, 3328] - - [431, 8552.41] + - [503, 8552.41] - - [4288, 6784, 1, 1280] - - [437, 8212.46] + - [509, 8212.46] - - [2560, 7000, 1, 2560] - - [439, 7655.34] + - [511, 7655.34] - - [1408, 704, 1, 1280] - - [435, 5756.79] + - [507, 5756.79] - - [2944, 1024, 1, 256] - - [443, 6880.91] + - [515, 6880.91] - - [6784, 64, 1, 256] - - [438, 4438.96] + - [510, 4438.96] - - [2368, 4288, 1, 3328] - - [439, 8377.99] + - [511, 8377.99] - - [4, 1408, 1, 256] - - [456, 222.599] + - [528, 222.599] - - [1024, 1408, 1, 1280] - - [433, 6339.38] + - [505, 6339.38] - - [64, 64, 1, 256] - - [390, 187.346] + - [462, 187.346] - - [704, 256, 1, 3328] - - [432, 4046.14] + - [504, 4046.14] - - [6784, 5056, 1, 256] - - [443, 7972.17] + - [515, 7972.17] - - [1856, 1856, 1, 128] - - [427, 3716.61] + - [499, 3716.61] - - [3584, 5056, 1, 3328] - - [443, 8684.76] + - [515, 8684.76] - - [448, 6784, 1, 128] - - [425, 3829.05] + - [497, 3829.05] - - [4, 704, 1, 3328] - - [458, 393.206] + - [530, 393.206] - - [35, 8457, 1, 4096] - - [347, 3173.24] + - [419, 3173.24] - - [448, 2944, 1, 256] - - [441, 5553.41] + - [513, 5553.41] - - [4, 4288, 1, 3328] - - [400, 573.211] + - [472, 573.211] - - [2944, 6784, 1, 256] - - [437, 8566.06] + - [509, 8566.06] - - [2944, 2944, 1, 128] - - [421, 4540.83] + - [493, 4540.83] - - [4, 4, 1, 1280] - - [400, 3.14762] + - [472, 3.14762] - - [1856, 3584, 1, 1280] - - [437, 7306.36] + - [509, 7306.36] - - [64, 2944, 1, 256] - - [449, 2292.61] + - [521, 2292.61] - - [448, 256, 1, 128] - - [358, 797.93] + - [430, 797.93] - - [4288, 448, 1, 128] - - [424, 3430.5] + - [496, 3430.5] - - [4608, 24000, 1, 1536] - - [442, 6820.24] + - [514, 6820.24] - - [1856, 1408, 1, 3328] - - [445, 6600.24] + - [517, 6600.24] - - [128, 128, 1, 128] - - [350, 161.917] + - [422, 161.917] - - [1024, 4288, 1, 3328] - - [433, 7937.08] + - [505, 7937.08] - - [448, 2368, 1, 256] - - [441, 4526.45] + - [513, 4526.45] - - [1024, 4, 1, 128] - - [453, 16.9907] + - [525, 16.9907] - - [64, 1408, 1, 1280] - - [373, 3345.32] + - [445, 3345.32] - - [64, 6784, 1, 1280] - - [438, 5526.6] + - [510, 5526.6] - - [5056, 448, 1, 256] - - [432, 4216.65] + - [504, 4216.65] - - [2944, 2368, 1, 3328] - - [443, 7000.42] + - [515, 7000.42] - - [704, 4288, 1, 3328] - - [449, 6414.43] + - [521, 6414.43] - - [1408, 128, 1, 256] - - [432, 2720.46] + - [504, 2720.46] - - [1024, 1856, 1, 1280] - - [443, 7682.93] + - [515, 7682.93] - - [2048, 6400, 1, 2048] - - [439, 7418.22] + - [511, 7418.22] - - [512, 48000, 1, 2816] - - [443, 8884.77] + - [515, 8884.77] - - [5124, 9124, 1, 2560] - - [435, 6040.8] + - [507, 6040.8] - - [128, 2368, 1, 3328] - - [389, 5025.66] + - [461, 5025.66] - - [1024, 5888, 1, 256] - - [437, 7322.21] + - [509, 7322.21] - - [64, 2944, 1, 1280] - - [373, 4222.31] + - [445, 4222.31] - - [5056, 64, 1, 3328] - - [414, 4936.32] + - [486, 4936.32] - - [128, 704, 1, 128] - - [359, 683.414] + - [431, 683.414] - - [1408, 2368, 1, 256] - - [438, 6404.22] + - [510, 6404.22] - - [1408, 1408, 1, 256] - - [443, 4537.93] + - [515, 4537.93] - - [4, 64, 1, 128] - - [452, 2.56747] + - [524, 2.56747] - - [64, 1024, 1, 128] - - [351, 532.372] + - [423, 532.372] - - [1024, 8, 1, 500000] - - [341, 1685.08] + - [413, 1685.08] - - [2368, 2368, 1, 128] - - [422, 4334.33] + - [494, 4334.33] - - [64, 5888, 1, 128] - - [351, 2003.19] + - [423, 2003.19] - - [5888, 4, 1, 3328] - - [369, 339.118] + - [441, 339.118] - - [6784, 1408, 1, 128] - - [425, 4431.23] + - [497, 4431.23] - - [4288, 5888, 1, 256] - - [443, 7800.88] + - [515, 7800.88] - - [1408, 5056, 1, 256] - - [437, 8153.38] + - [509, 8153.38] - - [5056, 128, 1, 3328] - - [394, 5829.93] + - [466, 5829.93] - - [128, 128, 1, 1280] - - [397, 1691.35] + - [469, 1691.35] - - [448, 704, 1, 256] - - [438, 3364.28] + - [510, 3364.28] - - [4288, 3584, 1, 128] - - [422, 2952.68] + - [494, 2952.68] - - [2944, 128, 1, 3328] - - [394, 5620.82] + - [466, 5620.82] - - [64, 1408, 1, 3328] - - [395, 4169.91] + - [467, 4169.91] - - [3584, 5056, 1, 1280] - - [440, 7780.76] + - [512, 7780.76] - - [256, 448, 1, 1280] - - [373, 3929.45] + - [445, 3929.45] - - [704, 704, 1, 128] - - [421, 2346.17] + - [493, 2346.17] - - [5056, 4, 1, 128] - - [452, 144.557] + - [524, 144.557] - - [704, 256, 1, 1280] - - [441, 2283.22] + - [513, 2283.22] - - [64, 2368, 1, 3328] - - [373, 4921.69] + - [445, 4921.69] - - [1856, 1024, 1, 128] - - [422, 3459.57] + - [494, 3459.57] - - [1856, 64, 1, 128] - - [354, 918.237] + - [426, 918.237] - - [4096, 64, 1, 4096] - - [399, 4000.62] + - [471, 4000.62] - - [1024, 24000, 1, 1536] - - [435, 8502.36] + - [507, 8502.36] - - [704, 4288, 1, 256] - - [439, 6003.83] + - [511, 6003.83] - - [5888, 2368, 1, 1280] - - [430, 8801.3] + - [502, 8801.3] - - [128, 256, 1, 256] - - [384, 1070.08] + - [456, 1070.08] - - [64, 128, 1, 256] - - [390, 374.591] + - [462, 374.591] - - [2368, 5888, 1, 1280] - - [433, 8308.63] + - [505, 8308.63] - - [5888, 256, 1, 1280] - - [441, 7154.42] + - [513, 7154.42] - - [1760, 128, 1, 1760] - - [382, 5363.91] + - [454, 5363.91] - - [4, 5888, 1, 1280] - - [390, 542.304] + - [462, 542.304] - - [704, 128, 1, 128] - - [362, 779.447] + - [434, 779.447] - - [1024, 4, 1, 1280] - - [390, 392.531] + - [462, 392.531] - - [2368, 1856, 1, 3328] - - [433, 7975.32] + - [505, 7975.32] - - [2368, 128, 1, 128] - - [355, 1584.96] + - [427, 1584.96] - - [2944, 704, 1, 256] - - [441, 4039.21] + - [513, 4039.21] - - [5056, 128, 1, 128] - - [421, 2575.89] + - [493, 2575.89] - - [2368, 1024, 1, 3328] - - [449, 6165.54] + - [521, 6165.54] - - [256, 704, 1, 3328] - - [432, 4028.74] + - [504, 4028.74] - - [704, 3584, 1, 256] - - [443, 6102.92] + - [515, 6102.92] - - [704, 2944, 1, 3328] - - [433, 8202.84] + - [505, 8202.84] - - [6784, 1024, 1, 128] - - [425, 4386.4] + - [497, 4386.4] - - [256, 448, 1, 128] - - [362, 834.195] + - [434, 834.195] - - [448, 1024, 1, 3328] - - [450, 5412.48] + - [522, 5412.48] - - [2944, 1024, 1, 3328] - - [443, 6265.87] + - [515, 6265.87] - - [2944, 5056, 1, 128] - - [421, 4770.88] + - [493, 4770.88] - - [2368, 256, 1, 256] - - [438, 3975.23] + - [510, 3975.23] - - [1408, 6784, 1, 256] - - [437, 7987.02] + - [509, 7987.02] - - [6784, 1408, 1, 3328] - - [437, 8472.71] + - [509, 8472.71] - - [4288, 6784, 1, 128] - - [428, 3865.2] + - [500, 3865.2] - - [704, 64, 1, 256] - - [376, 1287.41] + - [448, 1287.41] - - [5888, 4, 1, 1280] - - [375, 510.022] + - [447, 510.022] - - [256, 2368, 1, 3328] - - [438, 5837.65] + - [510, 5837.65] - - [6784, 2944, 1, 1280] - - [443, 8560.54] + - [515, 8560.54] - - [4288, 1856, 1, 128] - - [421, 4617.07] + - [493, 4617.07] - - [1856, 2944, 1, 128] - - [421, 4287.73] + - [493, 4287.73] - - [6784, 448, 1, 128] - - [425, 3893.43] + - [497, 3893.43] - - [64, 3584, 1, 128] - - [351, 1609.76] + - [423, 1609.76] - - [448, 5056, 1, 1280] - - [441, 7124.41] + - [513, 7124.41] - - [2368, 1856, 1, 128] - - [424, 4004.65] + - [496, 4004.65] - - [64, 2944, 1, 3328] - - [374, 5086.48] + - [446, 5086.48] - - [4288, 704, 1, 256] - - [439, 6176.57] + - [511, 6176.57] - - [256, 3584, 1, 128] - - [422, 2553.15] + - [494, 2553.15] - - [5888, 704, 1, 256] - - [438, 6781.51] + - [510, 6781.51] - - [3584, 1024, 1, 128] - - [425, 3660.95] + - [497, 3660.95] - - [256, 5888, 1, 3328] - - [441, 7772.13] + - [513, 7772.13] - - [1408, 4288, 1, 3328] - - [437, 8832.86] + - [509, 8832.86] - - [6784, 4288, 1, 256] - - [443, 8566.14] + - [515, 8566.14] - - [4288, 256, 1, 128] - - [423, 1953.79] + - [495, 1953.79] - - [5888, 256, 1, 256] - - [441, 3730.53] + - [513, 3730.53] - - [6784, 1024, 1, 1280] - - [437, 8578.39] + - [509, 8578.39] - - [5888, 1024, 1, 128] - - [422, 4092.96] + - [494, 4092.96] - - [1024, 128, 1, 256] - - [372, 1897.98] + - [444, 1897.98] - - [512, 16, 1, 500000] - - [343, 2363.79] + - [415, 2363.79] - - [128, 64, 1, 3328] - - [400, 1592.56] + - [472, 1592.56] - - [448, 64, 1, 256] - - [390, 976.168] + - [462, 976.168] - - [2368, 256, 1, 128] - - [425, 2094.99] + - [497, 2094.99] - - [6784, 3584, 1, 1280] - - [437, 8570.16] + - [509, 8570.16] - - [1024, 6784, 1, 1280] - - [443, 8203.57] + - [515, 8203.57] - - [2944, 64, 1, 1280] - - [381, 4300.61] + - [453, 4300.61] - - [1408, 2944, 1, 1280] - - [433, 7349.64] + - [505, 7349.64] - - [256, 1856, 1, 256] - - [432, 4649.75] + - [504, 4649.75] - - [2048, 800, 1, 2048] - - [451, 4668.73] + - [523, 4668.73] - - [1408, 2368, 1, 3328] - - [441, 7537.74] + - [513, 7537.74] - - [2944, 4, 1, 3328] - - [390, 514.142] + - [462, 514.142] - - [128, 1408, 1, 3328] - - [382, 4991.64] + - [454, 4991.64] - - [2944, 1856, 1, 128] - - [421, 4317.39] + - [493, 4317.39] - - [256, 2944, 1, 128] - - [421, 2258.27] + - [493, 2258.27] - - [256, 6784, 1, 128] - - [421, 3147.02] + - [493, 3147.02] - - [2368, 4, 1, 128] - - [453, 33.9286] + - [525, 33.9286] - - [1408, 256, 1, 3328] - - [432, 5077.85] + - [504, 5077.85] - - [1856, 4, 1, 128] - - [453, 21.5025] + - [525, 21.5025] - - [5056, 6784, 1, 128] - - [421, 4945.11] + - [493, 4945.11] - - [4288, 5056, 1, 128] - - [424, 4729.87] + - [496, 4729.87] - - [1856, 5888, 1, 128] - - [421, 4707.96] + - [493, 4707.96] - - [2944, 5888, 1, 256] - - [435, 8014.78] + - [507, 8014.78] - - [3584, 1856, 1, 256] - - [437, 7567.13] + - [509, 7567.13] - - [4288, 3584, 1, 1280] - - [430, 8726.43] + - [502, 8726.43] - - [2368, 448, 1, 256] - - [438, 4227.7] + - [510, 4227.7] - - [4288, 256, 1, 3328] - - [439, 5487.41] + - [511, 5487.41] - - [1856, 704, 1, 128] - - [425, 3125.06] + - [497, 3125.06] - - [1408, 64, 1, 256] - - [385, 1620.09] + - [457, 1620.09] - - [64, 1856, 1, 128] - - [349, 955.147] + - [421, 955.147] - - [4, 256, 1, 128] - - [452, 10.8789] + - [524, 10.8789] - - [2560, 16, 1, 2560] - - [397, 2019.7] + - [469, 2019.7] - - [704, 5888, 1, 128] - - [426, 3976.26] + - [498, 3976.26] - - [6784, 3584, 1, 128] - - [425, 4018.91] + - [497, 4018.91] - - [1024, 64, 1, 256] - - [390, 1370.79] + - [462, 1370.79] - - [64, 2368, 1, 256] - - [432, 2255.76] + - [504, 2255.76] - - [4288, 5056, 1, 3328] - - [437, 8368.69] + - [509, 8368.69] - - [4, 1856, 1, 1280] - - [390, 392.126] + - [462, 392.126] - - [4288, 128, 1, 128] - - [355, 2287.03] + - [427, 2287.03] - - [1408, 1408, 1, 128] - - [425, 3233.48] + - [497, 3233.48] - - [7680, 16, 1, 2560] - - [393, 2257.37] + - [465, 2257.37] - - [1856, 128, 1, 128] - - [355, 1532.8] + - [427, 1532.8] - - [5056, 2368, 1, 256] - - [437, 8167.29] + - [509, 8167.29] - - [4288, 704, 1, 3328] - - [443, 6411.16] + - [515, 6411.16] - - [448, 3584, 1, 256] - - [443, 5477.74] + - [515, 5477.74] - - [2368, 64, 1, 1280] - - [373, 3936.52] + - [445, 3936.52] - - [2368, 1024, 1, 1280] - - [439, 7688.82] + - [511, 7688.82] - - [2944, 1408, 1, 3328] - - [430, 7668.78] + - [502, 7668.78] - - [1408, 448, 1, 256] - - [432, 4863.98] + - [504, 4863.98] - - [1024, 1408, 1, 3328] - - [441, 7448.99] + - [513, 7448.99] - - [2944, 5888, 1, 1280] - - [431, 8208.57] + - [503, 8208.57] - - [1408, 4, 1, 1280] - - [370, 479.419] + - [442, 479.419] - - [5888, 3584, 1, 256] - - [431, 8610.09] + - [503, 8610.09] - - [2368, 5056, 1, 128] - - [428, 3726.25] + - [500, 3726.25] - - [1408, 1856, 1, 3328] - - [432, 7829.48] + - [504, 7829.48] - - [4, 4, 1, 3328] - - [459, 4.39419] + - [531, 4.39419] - - [6784, 1408, 1, 1280] - - [432, 7690.8] + - [504, 7690.8] - - [4096, 7000, 1, 4096] - - [444, 6272.49] + - [516, 6272.49] - - [704, 2944, 1, 256] - - [433, 6095.91] + - [505, 6095.91] - - [4288, 64, 1, 256] - - [398, 2121.31] + - [470, 2121.31] - - [6784, 5888, 1, 3328] - - [437, 8955.6] + - [509, 8955.6] - - [2368, 4288, 1, 128] - - [421, 4699.65] + - [493, 4699.65] - - [64, 4288, 1, 1280] - - [411, 4013.73] + - [483, 4013.73] - - [6784, 64, 1, 1280] - - [432, 5418.83] + - [504, 5418.83] - - [3584, 128, 1, 128] - - [361, 2165.3] + - [433, 2165.3] - - [1024, 6784, 1, 128] - - [422, 3765.3] + - [494, 3765.3] - - [4, 1856, 1, 128] - - [453, 33.3728] + - [525, 33.3728] - - [1408, 64, 1, 3328] - - [394, 4489.51] + - [466, 4489.51] - - [6784, 4, 1, 256] - - [390, 400.262] + - [462, 400.262] - - [1408, 1408, 1, 1280] - - [437, 8139.53] + - [509, 8139.53] - - [16384, 400, 1, 4096] - - [441, 6087.28] + - [513, 6087.28] - - [256, 2368, 1, 256] - - [432, 4766.35] + - [504, 4766.35] - - [448, 4288, 1, 3328] - - [439, 7577.08] + - [511, 7577.08] - - [2368, 1408, 1, 256] - - [435, 5284.53] + - [507, 5284.53] - - [5888, 5056, 1, 128] - - [422, 3643.6] + - [494, 3643.6] - - [704, 2368, 1, 256] - - [437, 5334.73] + - [509, 5334.73] - - [1024, 24000, 1, 2560] - - [445, 7438.06] + - [517, 7438.06] - - [2944, 448, 1, 1280] - - [446, 4937.53] + - [518, 4937.53] - - [5888, 2368, 1, 3328] - - [431, 8201.84] + - [503, 8201.84] - - [5124, 9124, 1, 1760] - - [438, 6764.06] + - [510, 6764.06] - - [448, 1408, 1, 1280] - - [432, 5881.54] + - [504, 5881.54] - - [448, 1856, 1, 1280] - - [439, 6225.56] + - [511, 6225.56] - - [4288, 448, 1, 1280] - - [441, 5626.37] + - [513, 5626.37] - - [5888, 704, 1, 3328] - - [435, 7873.62] + - [507, 7873.62] - - [5056, 256, 1, 128] - - [426, 2921.03] + - [498, 2921.03] - - [1856, 256, 1, 128] - - [428, 1995.42] + - [500, 1995.42] - - [64, 1408, 1, 128] - - [349, 758.938] + - [421, 758.938] - - [704, 4, 1, 256] - - [390, 130.697] + - [462, 130.697] - - [1408, 5888, 1, 128] - - [421, 4574.05] + - [493, 4574.05] - - [7680, 12000, 1, 2560] - - [437, 8747.13] + - [509, 8747.13] - - [1408, 1024, 1, 256] - - [434, 4609.23] + - [506, 4609.23] - - [8192, 400, 1, 2048] - - [446, 5283.25] + - [518, 5283.25] - - [1024, 1856, 1, 128] - - [421, 2686.38] + - [493, 2686.38] - - [256, 704, 1, 128] - - [421, 1004.83] + - [493, 1004.83] - - [2560, 128, 1, 2560] - - [399, 4259.14] + - [471, 4259.14] - - [448, 1024, 1, 256] - - [432, 4813.24] + - [504, 4813.24] - - [128, 4, 1, 3328] - - [458, 128.408] + - [530, 128.408] - - [5056, 6784, 1, 1280] - - [440, 6579.85] + - [512, 6579.85] - - [1408, 64, 1, 128] - - [362, 819.3] + - [434, 819.3] - - [1024, 448, 1, 1280] - - [441, 5703.31] + - [513, 5703.31] - - [704, 5056, 1, 3328] - - [433, 7574.49] + - [505, 7574.49] - - [128, 5056, 1, 256] - - [432, 5113.53] + - [504, 5113.53] - - [64, 1024, 1, 3328] - - [417, 3980.1] + - [489, 3980.1] - - [1856, 4, 1, 3328] - - [371, 433.253] + - [443, 433.253] - - [4, 2944, 1, 128] - - [453, 46.6225] + - [525, 46.6225] - - [2368, 2944, 1, 3328] - - [431, 9002.13] + - [503, 9002.13] - - [448, 448, 1, 1280] - - [373, 3969.52] + - [445, 3969.52] - - [2368, 3584, 1, 256] - - [443, 7806.39] + - [515, 7806.39] - - [5056, 3584, 1, 1280] - - [430, 8971.56] + - [502, 8971.56] - - [5124, 9124, 1, 4096] - - [443, 7208.72] + - [515, 7208.72] - - [7680, 48000, 1, 2560] - - [437, 3835.91] + - [509, 3835.91] - - [448, 4, 1, 3328] - - [458, 409.7] + - [530, 409.7] - - [1856, 2944, 1, 1280] - - [430, 7173.71] + - [502, 7173.71] - - [1024, 48000, 1, 2816] - - [437, 8976.26] + - [509, 8976.26] - - [128, 1024, 1, 256] - - [376, 1969.26] + - [448, 1969.26] - - [2944, 1408, 1, 256] - - [439, 4585.12] + - [511, 4585.12] - - [4288, 1408, 1, 3328] - - [433, 8237.27] + - [505, 8237.27] - - [3584, 64, 1, 3328] - - [379, 5183.16] + - [451, 5183.16] - - [5888, 2944, 1, 128] - - [428, 3674.56] + - [500, 3674.56] - - [2944, 1024, 1, 128] - - [425, 3834.32] + - [497, 3834.32] - - [4288, 5056, 1, 1280] - - [437, 8086.1] + - [509, 8086.1] - - [5888, 6784, 1, 1280] - - [431, 6941.32] + - [503, 6941.32] - - [6784, 5056, 1, 128] - - [422, 4860.15] + - [494, 4860.15] - - [256, 1024, 1, 3328] - - [446, 5156.22] + - [518, 5156.22] - - [3584, 4, 1, 256] - - [390, 332.529] + - [462, 332.529] - - [1760, 1600, 1, 1760] - - [433, 6330.76] + - [505, 6330.76] - - [1856, 64, 1, 3328] - - [394, 4756.03] + - [466, 4756.03] - - [4, 128, 1, 3328] - - [458, 160.244] + - [530, 160.244] - - [5888, 1408, 1, 3328] - - [431, 8722.74] + - [503, 8722.74] - - [448, 2944, 1, 128] - - [424, 2997.63] + - [496, 2997.63] - - [2368, 1856, 1, 256] - - [432, 6662.34] + - [504, 6662.34] - - [256, 5056, 1, 256] - - [434, 5256.29] + - [506, 5256.29] - - [128, 3584, 1, 128] - - [353, 2073.56] + - [425, 2073.56] - - [448, 3584, 1, 3328] - - [430, 6833.96] + - [502, 6833.96] - - [4, 5056, 1, 3328] - - [400, 581.523] + - [472, 581.523] - - [704, 2368, 1, 128] - - [421, 3402.29] + - [493, 3402.29] - - [5888, 256, 1, 128] - - [426, 2977.54] + - [498, 2977.54] - - [4, 5056, 1, 128] - - [452, 65.2074] + - [524, 65.2074] - - [448, 256, 1, 256] - - [438, 1764.53] + - [510, 1764.53] - - [704, 4, 1, 3328] - - [390, 398.554] + - [462, 398.554] - - [1408, 256, 1, 256] - - [433, 3463.86] + - [505, 3463.86] - - [3584, 1856, 1, 128] - - [429, 3228.19] + - [501, 3228.19] - - [4288, 4288, 1, 128] - - [425, 4853.93] + - [497, 4853.93] - - [1856, 1024, 1, 3328] - - [449, 5994.68] + - [521, 5994.68] - - [128, 5888, 1, 3328] - - [403, 6512.85] + - [475, 6512.85] - - [1024, 5056, 1, 256] - - [443, 7859.42] + - [515, 7859.42] - - [5888, 5888, 1, 1280] - - [443, 8131.44] + - [515, 8131.44] - - [5056, 5888, 1, 128] - - [422, 4920.71] + - [494, 4920.71] - - [2368, 1408, 1, 3328] - - [441, 7110.74] + - [513, 7110.74] - - [1024, 48000, 1, 1536] - - [441, 8590.82] + - [513, 8590.82] - - [5888, 448, 1, 256] - - [442, 3567.74] + - [514, 3567.74] - - [2560, 3200, 1, 2560] - - [432, 7638.31] + - [504, 7638.31] - - [5888, 6784, 1, 128] - - [422, 3910.92] + - [494, 3910.92] - - [6144, 48000, 1, 2048] - - [443, 3412.95] + - [515, 3412.95] - - [6784, 5056, 1, 1280] - - [434, 7890.22] + - [506, 7890.22] - - [5056, 704, 1, 1280] - - [438, 7665.06] + - [510, 7665.06] - - [1024, 48000, 1, 2560] - - [443, 8188.5] + - [515, 8188.5] - - [4608, 32, 1, 1536] - - [411, 2856.97] + - [483, 2856.97] - - [1024, 2368, 1, 128] - - [421, 3019.35] + - [493, 3019.35] - - [128, 704, 1, 256] - - [372, 1696.33] + - [444, 1696.33] - - [2368, 448, 1, 3328] - - [438, 5799.29] + - [510, 5799.29] - - [128, 5888, 1, 1280] - - [432, 6680.75] + - [504, 6680.75] - - [16384, 800, 1, 4096] - - [437, 6322.22] + - [509, 6322.22] - - [448, 128, 1, 1280] - - [411, 2849.49] + - [483, 2849.49] - - [6784, 4, 1, 3328] - - [390, 563.12] + - [462, 563.12] - - [5888, 5056, 1, 1280] - - [437, 8631.33] + - [509, 8631.33] - - [1024, 64, 1, 3328] - - [412, 3481.96] + - [484, 3481.96] - - [3072, 48000, 1, 1024] - - [437, 9019.49] + - [509, 9019.49] - - [64, 3584, 1, 1280] - - [374, 4327.95] + - [446, 4327.95] - - [6784, 1408, 1, 256] - - [437, 6320.59] + - [509, 6320.59] - - [3584, 5888, 1, 128] - - [424, 4406.79] + - [496, 4406.79] - - [5056, 5888, 1, 256] - - [443, 8037.13] + - [515, 8037.13] - - [2368, 1024, 1, 256] - - [435, 4936.14] + - [507, 4936.14] - - [2944, 1856, 1, 256] - - [443, 7222.32] + - [515, 7222.32] - - [1856, 6784, 1, 1280] - - [433, 8251.81] + - [505, 8251.81] - - [64, 5056, 1, 128] - - [353, 1643.7] + - [425, 1643.7] - - [64, 6784, 1, 128] - - [351, 1929.77] + - [423, 1929.77] - - [448, 704, 1, 128] - - [423, 979.959] + - [495, 979.959] - - [4, 1024, 1, 128] - - [452, 20.1416] + - [524, 20.1416] - - [4288, 3584, 1, 256] - - [437, 8444.14] + - [509, 8444.14] - - [1408, 704, 1, 128] - - [421, 3021.0] + - [493, 3021.0] - - [64, 256, 1, 3328] - - [417, 2227.47] + - [489, 2227.47] - - [6784, 448, 1, 3328] - - [443, 6573.11] + - [515, 6573.11] - - [5056, 1856, 1, 1280] - - [435, 7976.23] + - [507, 7976.23] - - [1408, 1024, 1, 3328] - - [433, 7470.33] + - [505, 7470.33] - - [2368, 256, 1, 3328] - - [438, 5394.37] + - [510, 5394.37] - - [5888, 3584, 1, 1280] - - [430, 9031.55] + - [502, 9031.55] - - [1856, 3584, 1, 3328] - - [445, 7272.6] + - [517, 7272.6] - - [5888, 128, 1, 1280] - - [438, 6684.48] + - [510, 6684.48] - - [1024, 2944, 1, 256] - - [443, 7415.09] + - [515, 7415.09] - - [448, 6784, 1, 1280] - - [439, 7923.78] + - [511, 7923.78] - - [256, 3584, 1, 1280] - - [435, 6901.87] + - [507, 6901.87] - - [704, 5056, 1, 256] - - [440, 5004.55] + - [512, 5004.55] - - [3584, 1024, 1, 3328] - - [432, 7894.63] + - [504, 7894.63] - - [2944, 1856, 1, 1280] - - [437, 7903.27] + - [509, 7903.27] - - [128, 256, 1, 128] - - [350, 325.745] + - [422, 325.745] - - [5056, 256, 1, 256] - - [434, 3356.56] + - [506, 3356.56] - - [2944, 4288, 1, 3328] - - [443, 7813.93] + - [515, 7813.93] - - [2368, 3584, 1, 3328] - - [443, 8371.09] + - [515, 8371.09] - - [2944, 704, 1, 1280] - - [449, 5514.09] + - [521, 5514.09] - - [128, 4, 1, 256] - - [390, 25.3062] + - [462, 25.3062] - - [2944, 3584, 1, 1280] - - [437, 7738.83] + - [509, 7738.83] - - [1856, 5888, 1, 1280] - - [431, 8584.63] + - [503, 8584.63] - - [256, 256, 1, 1280] - - [411, 2962.18] + - [483, 2962.18] - - [2048, 3200, 1, 2048] - - [439, 6911.69] + - [511, 6911.69] - - [4288, 1408, 1, 256] - - [437, 7954.0] + - [509, 7954.0] - - [3584, 64, 1, 256] - - [438, 2780.42] + - [510, 2780.42] - - [64, 1856, 1, 3328] - - [373, 4912.04] + - [445, 4912.04] - - [256, 1408, 1, 128] - - [421, 1373.24] + - [493, 1373.24] - - [5888, 1408, 1, 128] - - [426, 4242.01] + - [498, 4242.01] - - [4288, 2368, 1, 1280] - - [435, 8012.7] + - [507, 8012.7] - - [4, 4288, 1, 256] - - [456, 301.674] + - [528, 301.674] - - [256, 4288, 1, 128] - - [421, 2706.36] + - [493, 2706.36] - - [2048, 128, 1, 2048] - - [416, 2885.26] + - [488, 2885.26] - - [256, 128, 1, 3328] - - [418, 3170.21] + - [490, 3170.21] - - [512, 8, 1, 500000] - - [342, 1915.12] + - [414, 1915.12] - - [6784, 2368, 1, 256] - - [437, 8323.66] + - [509, 8323.66] - - [5888, 128, 1, 128] - - [425, 2466.08] + - [497, 2466.08] - - [1024, 24000, 1, 2816] - - [435, 8131.64] + - [507, 8131.64] - - [7680, 5984, 1, 2560] - - [439, 6040.77] + - [511, 6040.77] - - [4288, 1856, 1, 256] - - [451, 5818.53] + - [523, 5818.53] - - [1856, 256, 1, 3328] - - [432, 6532.03] + - [504, 6532.03] - - [1856, 2944, 1, 256] - - [437, 7312.92] + - [509, 7312.92] - - [5056, 1024, 1, 128] - - [427, 4103.0] + - [499, 4103.0] - - [64, 5888, 1, 1280] - - [432, 5058.25] + - [504, 5058.25] - - [1760, 800, 1, 1760] - - [435, 7280.0] + - [507, 7280.0] - - [6784, 256, 1, 128] - - [425, 3257.69] + - [497, 3257.69] - - [5888, 704, 1, 128] - - [421, 3813.93] + - [493, 3813.93] - - [1408, 2368, 1, 128] - - [422, 3561.27] + - [494, 3561.27] - - [1024, 4288, 1, 1280] - - [441, 7752.74] + - [513, 7752.74] - - [2368, 5056, 1, 3328] - - [444, 7711.91] + - [516, 7711.91] - - [448, 4, 1, 128] - - [452, 18.4795] + - [524, 18.4795] - - [4, 256, 1, 3328] - - [459, 269.71] + - [531, 269.71] - - [4288, 1024, 1, 3328] - - [438, 7910.27] + - [510, 7910.27] - - [6144, 48000, 1, 2560] - - [437, 3541.09] + - [509, 3541.09] - - [1024, 5056, 1, 3328] - - [431, 8509.66] + - [503, 8509.66] - - [1024, 1856, 1, 3328] - - [437, 7907.93] + - [509, 7907.93] - - [704, 704, 1, 1280] - - [449, 5648.15] + - [521, 5648.15] - - [128, 2368, 1, 1280] - - [408, 4145.11] + - [480, 4145.11] - - [1408, 128, 1, 3328] - - [381, 4919.6] + - [453, 4919.6] - - [3584, 256, 1, 1280] - - [433, 5185.56] + - [505, 5185.56] - - [4, 128, 1, 128] - - [452, 3.07891] + - [524, 3.07891] - - [5888, 64, 1, 1280] - - [381, 4499.59] + - [453, 4499.59] - - [3584, 128, 1, 1280] - - [438, 5929.01] + - [510, 5929.01] - - [4, 256, 1, 1280] - - [457, 170.767] + - [529, 170.767] - - [128, 704, 1, 3328] - - [381, 4379.37] + - [453, 4379.37] - - [4288, 6784, 1, 256] - - [431, 7181.09] + - [503, 7181.09] - - [3584, 2944, 1, 3328] - - [437, 8553.3] + - [509, 8553.3] - - [128, 1856, 1, 256] - - [438, 3207.77] + - [510, 3207.77] - - [64, 4288, 1, 256] - - [432, 2907.99] + - [504, 2907.99] - - [4, 3584, 1, 3328] - - [390, 560.605] + - [462, 560.605] - - [64, 4, 1, 3328] - - [459, 67.5025] + - [531, 67.5025] - - [4, 64, 1, 3328] - - [459, 88.8467] + - [531, 88.8467] - - [5888, 2944, 1, 256] - - [437, 7255.77] + - [509, 7255.77] - - [1856, 64, 1, 256] - - [383, 1743.72] + - [455, 1743.72] - - [5056, 128, 1, 1280] - - [438, 6009.79] + - [510, 6009.79] - - [448, 4288, 1, 1280] - - [439, 6466.82] + - [511, 6466.82] - - [448, 1856, 1, 3328] - - [439, 6381.99] + - [511, 6381.99] - - [1024, 4288, 1, 128] - - [424, 3491.87] + - [496, 3491.87] - - [4, 1024, 1, 256] - - [457, 172.563] + - [529, 172.563] - - [5056, 4288, 1, 256] - - [437, 8241.52] + - [509, 8241.52] - - [1024, 448, 1, 256] - - [441, 4218.51] + - [513, 4218.51] - - [1024, 3584, 1, 256] - - [437, 6513.69] + - [509, 6513.69] - - [2944, 128, 1, 1280] - - [381, 4710.48] + - [453, 4710.48] - - [2048, 32, 1, 2048] - - [396, 1779.23] + - [468, 1779.23] - - [64, 256, 1, 256] - - [390, 655.46] + - [462, 655.46] - - [1408, 4, 1, 128] - - [453, 20.1249] + - [525, 20.1249] - - [128, 2368, 1, 128] - - [353, 1707.73] + - [425, 1707.73] - - [256, 704, 1, 1280] - - [432, 3735.31] + - [504, 3735.31] - - [64, 2368, 1, 128] - - [360, 1049.81] + - [432, 1049.81] - - [6784, 6784, 1, 3328] - - [437, 9277.94] + - [509, 9277.94] - - [448, 5888, 1, 1280] - - [443, 7319.75] + - [515, 7319.75] - - [5056, 448, 1, 128] - - [425, 3694.43] + - [497, 3694.43] - - [4288, 704, 1, 1280] - - [435, 7890.96] + - [507, 7890.96] - - [3584, 2944, 1, 128] - - [427, 4124.71] + - [499, 4124.71] - - [6784, 256, 1, 1280] - - [443, 7185.83] + - [515, 7185.83] - - [256, 2944, 1, 1280] - - [432, 6736.76] + - [504, 6736.76] - - [64, 4288, 1, 128] - - [351, 1614.41] + - [423, 1614.41] - - [2368, 5888, 1, 3328] - - [433, 8616.46] + - [505, 8616.46] - - [4, 64, 1, 256] - - [370, 11.4778] + - [442, 11.4778] - - [704, 1024, 1, 3328] - - [438, 6801.92] + - [510, 6801.92] - - [2368, 1856, 1, 1280] - - [435, 7853.57] + - [507, 7853.57] - - [448, 5056, 1, 3328] - - [438, 7453.04] + - [510, 7453.04] - - [128, 448, 1, 128] - - [353, 530.449] + - [425, 530.449] - - [128, 6784, 1, 256] - - [433, 5557.55] + - [505, 5557.55] - - [3584, 4288, 1, 128] - - [424, 4462.73] + - [496, 4462.73] - - [64, 448, 1, 128] - - [353, 278.132] + - [425, 278.132] - - [5888, 4288, 1, 3328] - - [430, 9153.55] + - [502, 9153.55] - - [2368, 704, 1, 256] - - [437, 5350.78] + - [509, 5350.78] - - [256, 1856, 1, 3328] - - [432, 6536.35] + - [504, 6536.35] - - [1856, 128, 1, 256] - - [446, 2847.36] + - [518, 2847.36] - - [6784, 128, 1, 128] - - [426, 2530.82] + - [498, 2530.82] - - [3584, 1408, 1, 128] - - [427, 3625.62] + - [499, 3625.62] - - [1856, 5056, 1, 1280] - - [433, 8123.39] + - [505, 8123.39] - - [2944, 1024, 1, 1280] - - [443, 8450.41] + - [515, 8450.41] - - [5056, 4, 1, 256] - - [457, 380.787] + - [529, 380.787] - - [3584, 5888, 1, 3328] - - [435, 8567.99] + - [507, 8567.99] - - [2368, 4288, 1, 256] - - [439, 7858.07] + - [511, 7858.07] - - [1024, 2368, 1, 3328] - - [433, 6776.45] + - [505, 6776.45] - - [64, 704, 1, 3328] - - [388, 3503.52] + - [460, 3503.52] - - [704, 1408, 1, 256] - - [433, 6099.99] + - [505, 6099.99] - - [4096, 128, 1, 4096] - - [413, 4116.57] + - [485, 4116.57] - - [1024, 3584, 1, 1280] - - [443, 7231.65] + - [515, 7231.65] - - [4288, 5888, 1, 3328] - - [437, 8762.42] + - [509, 8762.42] - - [4288, 4, 1, 1280] - - [390, 492.797] + - [462, 492.797] - - [4608, 16, 1, 1536] - - [391, 1892.58] + - [463, 1892.58] - - [5888, 64, 1, 128] - - [368, 1747.73] + - [440, 1747.73] - - [4, 5888, 1, 128] - - [453, 84.5915] + - [525, 84.5915] - - [1024, 2944, 1, 3328] - - [441, 6907.05] + - [513, 6907.05] - - [6784, 1856, 1, 256] - - [437, 6274.07] + - [509, 6274.07] - - [2048, 64, 1, 2048] - - [420, 2371.44] + - [492, 2371.44] - - [256, 6784, 1, 1280] - - [437, 7067.04] + - [509, 7067.04] - - [1856, 3584, 1, 256] - - [443, 7706.87] + - [515, 7706.87] - - [128, 448, 1, 3328] - - [388, 3995.93] + - [460, 3995.93] - - [6784, 1856, 1, 128] - - [425, 4459.09] + - [497, 4459.09] - - [4, 448, 1, 256] - - [390, 84.4294] + - [462, 84.4294] - - [5056, 128, 1, 256] - - [438, 4954.5] + - [510, 4954.5] - - [512, 24000, 1, 2816] - - [431, 8994.98] + - [503, 8994.98] - - [256, 5888, 1, 1280] - - [430, 6184.0] + - [502, 6184.0] - - [4, 128, 1, 1280] - - [458, 71.9597] + - [530, 71.9597] - - [16384, 1600, 1, 4096] - - [437, 6921.09] + - [509, 6921.09] - - [6784, 128, 1, 1280] - - [441, 6486.37] + - [513, 6486.37] - - [64, 1408, 1, 256] - - [378, 1647.86] + - [450, 1647.86] - - [2368, 1408, 1, 128] - - [425, 3937.1] + - [497, 3937.1] - - [1856, 448, 1, 256] - - [438, 4635.57] + - [510, 4635.57] - - [1408, 1024, 1, 128] - - [421, 3208.51] + - [493, 3208.51] - - [128, 64, 1, 128] - - [350, 70.192] + - [422, 70.192] - - [6784, 3584, 1, 3328] - - [443, 8466.28] + - [515, 8466.28] - - [1760, 7000, 1, 1760] - - [441, 8149.21] + - [513, 8149.21] - - [2944, 64, 1, 3328] - - [374, 5018.09] + - [446, 5018.09] - - [64, 64, 1, 128] - - [350, 35.5249] + - [422, 35.5249] - - [2368, 5056, 1, 1280] - - [437, 8764.0] + - [509, 8764.0] - - [64, 4, 1, 1280] - - [459, 43.6745] + - [531, 43.6745] - - [1408, 2368, 1, 1280] - - [438, 7660.38] + - [510, 7660.38] - - [128, 1408, 1, 1280] - - [373, 4185.27] + - [445, 4185.27] - - [256, 64, 1, 3328] - - [398, 2071.75] + - [470, 2071.75] - - [704, 4288, 1, 128] - - [421, 4069.18] + - [493, 4069.18] - - [128, 1856, 1, 3328] - - [404, 5776.15] + - [476, 5776.15] - - [2944, 2944, 1, 256] - - [443, 7949.31] + - [515, 7949.31] - - [2944, 4, 1, 1280] - - [390, 483.218] + - [462, 483.218] - - [5888, 4, 1, 256] - - [375, 396.765] + - [447, 396.765] - - [6784, 256, 1, 256] - - [449, 4044.83] + - [521, 4044.83] - - [256, 5056, 1, 3328] - - [432, 7607.37] + - [504, 7607.37] - - [128, 4288, 1, 1280] - - [373, 4958.78] + - [445, 4958.78] - - [5056, 1856, 1, 128] - - [425, 4560.94] + - [497, 4560.94] - - [5056, 1024, 1, 3328] - - [437, 8634.18] + - [509, 8634.18] - - [128, 128, 1, 256] - - [375, 699.151] + - [447, 699.151] - - [1760, 64, 1, 1760] - - [381, 4580.65] + - [453, 4580.65] - - [4288, 3584, 1, 3328] - - [443, 9143.76] + - [515, 9143.76] - - [448, 704, 1, 3328] - - [432, 4473.43] + - [504, 4473.43] - - [448, 448, 1, 128] - - [363, 1264.38] + - [435, 1264.38] - - [1024, 2368, 1, 1280] - - [441, 7452.51] + - [513, 7452.51] - - [1856, 704, 1, 3328] - - [432, 6103.34] + - [504, 6103.34] - - [4, 2368, 1, 128] - - [452, 96.019] + - [524, 96.019] - - [5888, 6784, 1, 3328] - - [437, 9131.74] + - [509, 9131.74] - - [704, 4288, 1, 1280] - - [439, 7906.46] + - [511, 7906.46] - - [704, 256, 1, 256] - - [432, 2772.78] + - [504, 2772.78] - - [1024, 48000, 1, 2048] - - [436, 6513.45] + - [508, 6513.45] - - [4288, 1024, 1, 128] - - [421, 4291.77] + - [493, 4291.77] - - [512, 2048, 1, 49] - - [467, 4555.08] + - [539, 4555.08] - - [512, 128, 1, 784] - - [460, 3195.39] + - [532, 3195.39] - - [2048, 512, 1, 49] - - [468, 4253.43] + - [540, 4253.43] - - [1024, 256, 1, 196] - - [464, 4039.43] + - [536, 4039.43] - - [256, 64, 1, 3136] - - [462, 3015.37] + - [534, 3015.37] - - [256, 1024, 1, 196] - - [466, 4225.45] + - [538, 4225.45] - - [64, 256, 1, 3136] - - [463, 3058.45] + - [535, 3058.45] - - [128, 512, 1, 784] - - [461, 3380.38] + - [533, 3380.38] - - [64, 64, 1, 3136] - - [465, 1372.44] + - [537, 1372.44] - - [1024, 1024, 1, 3328] - - [578, 8705.1] + - [650, 8705.1] - - [2048, 200, 1, 3200] - - [583, 6173.42] + - [655, 6173.42] - - [1024, 200, 1, 13312] - - [481, 5213.31] + - [553, 5213.31] - - [1024, 256, 1, 1536] - - [583, 5859.43] + - [655, 5859.43] - - [4096, 256, 1, 12288] - - [588, 8807.52] + - [660, 8807.52] - - [64, 200, 1, 1024] - - [555, 366.632] + - [627, 366.632] - - [32, 512, 1, 1024] - - [510, 453.049] + - [582, 453.049] - - [2048, 256, 1, 3328] - - [572, 7876.73] + - [644, 7876.73] - - [4096, 512, 1, 32] - - [576, 3975.74] + - [648, 3975.74] - - [2048, 256, 1, 13312] - - [553, 7837.81] + - [625, 7837.81] - - [4096, 200, 1, 11264] - - [588, 6902.76] + - [660, 6902.76] - - [2048, 512, 1, 1024] - - [582, 8100.14] + - [654, 8100.14] - - [2048, 1024, 1, 1664] - - [482, 9082.08] + - [554, 9082.08] - - [1024, 1024, 1, 64] - - [578, 4258.28] + - [650, 4258.28] - - [512, 1024, 1, 1536] - - [572, 7597.33] + - [644, 7597.33] - - [1024, 256, 1, 15360] - - [473, 6735.24] + - [545, 6735.24] - - [1, 512, 1, 1024] - - [523, 15.1657] + - [595, 15.1657] - - [4096, 512, 1, 1408] - - [485, 9024.52] + - [557, 9024.52] - - [1024, 200, 1, 1408] - - [583, 4461.09] + - [655, 4461.09] - - [1024, 512, 1, 512] - - [577, 6528.2] + - [649, 6528.2] - - [4096, 256, 1, 15360] - - [584, 8824.03] + - [656, 8824.03] - - [2048, 512, 1, 640] - - [574, 7989.25] + - [646, 7989.25] - - [4096, 1024, 1, 1280] - - [480, 9421.54] + - [552, 9421.54] - - [1024, 200, 1, 6144] - - [572, 4966.52] + - [644, 4966.52] - - [1024, 1024, 1, 512] - - [574, 7731.54] + - [646, 7731.54] - - [128, 512, 1, 2048] - - [490, 2190.34] + - [562, 2190.34] - - [2048, 1024, 1, 640] - - [480, 8581.8] + - [552, 8581.8] - - [1024, 256, 1, 3328] - - [572, 6192.71] + - [644, 6192.71] - - [4096, 1024, 1, 13312] - - [485, 9642.59] + - [557, 9642.59] - - [2048, 256, 1, 2048] - - [572, 7485.75] + - [644, 7485.75] - - [2048, 1024, 1, 13312] - - [485, 9352.26] + - [557, 9352.26] - - [2048, 512, 1, 16640] - - [573, 8839.17] + - [645, 8839.17] - - [1024, 512, 1, 128] - - [577, 4280.0] + - [649, 4280.0] - - [2048, 1024, 1, 3584] - - [480, 9264.72] + - [552, 9264.72] - - [2048, 512, 1, 256] - - [588, 6990.61] + - [660, 6990.61] - - [512, 256, 1, 3200] - - [535, 4154.52] + - [607, 4154.52] - - [4096, 1024, 1, 1920] - - [480, 9535.32] + - [552, 9535.32] - - [4096, 200, 1, 2560] - - [585, 6754.65] + - [657, 6754.65] - - [1024, 256, 1, 16384] - - [475, 6289.6] + - [547, 6289.6] - - [1024, 1024, 1, 1152] - - [578, 8407.39] + - [650, 8407.39] - - [2048, 200, 1, 32] - - [521, 1412.51] + - [593, 1412.51] - - [512, 1024, 1, 2816] - - [572, 7843.25] + - [644, 7843.25] - - [4096, 256, 1, 14336] - - [584, 8844.77] + - [656, 8844.77] - - [1024, 200, 1, 4608] - - [583, 4931.74] + - [655, 4931.74] - - [1024, 200, 1, 16384] - - [478, 5135.15] + - [550, 5135.15] - - [64, 256, 1, 1024] - - [556, 461.013] + - [628, 461.013] - - [1, 200, 1, 1024] - - [538, 7.49884] + - [610, 7.49884] - - [2048, 200, 1, 2080] - - [583, 6033.87] + - [655, 6033.87] - - [512, 256, 1, 1792] - - [493, 3153.71] + - [565, 3153.71] - - [2048, 200, 1, 1024] - - [583, 5711.3] + - [655, 5711.3] - - [4096, 1024, 1, 12288] - - [480, 9658.23] + - [552, 9658.23] - - [4096, 200, 1, 4096] - - [574, 6834.55] + - [646, 6834.55] - - [1024, 512, 1, 11264] - - [541, 7686.46] + - [613, 7686.46] - - [128, 512, 1, 1024] - - [511, 1458.99] + - [583, 1458.99] - - [32, 256, 1, 2048] - - [529, 384.899] + - [601, 384.899] - - [1024, 200, 1, 1792] - - [583, 4638.64] + - [655, 4638.64] - - [1024, 1024, 1, 1792] - - [578, 8550.56] + - [650, 8550.56] - - [32, 256, 1, 512] - - [562, 161.419] + - [634, 161.419] - - [512, 200, 1, 2816] - - [488, 3353.1] + - [560, 3353.1] - - [512, 200, 1, 3072] - - [473, 3298.89] + - [545, 3298.89] - - [1024, 1024, 1, 8192] - - [519, 8369.1] + - [591, 8369.1] - - [1024, 256, 1, 12288] - - [476, 6475.71] + - [548, 6475.71] - - [4096, 200, 1, 768] - - [578, 6367.97] + - [650, 6367.97] - - [1024, 512, 1, 16384] - - [594, 7367.12] + - [666, 7367.12] - - [4096, 256, 1, 1024] - - [574, 8214.16] + - [646, 8214.16] - - [1024, 512, 1, 256] - - [577, 5537.13] + - [649, 5537.13] - - [4096, 1024, 1, 8320] - - [480, 9674.26] + - [552, 9674.26] - - [4096, 256, 1, 9216] - - [582, 8791.02] + - [654, 8791.02] - - [1024, 512, 1, 1408] - - [572, 7459.65] + - [644, 7459.65] - - [1024, 512, 1, 5632] - - [583, 7997.91] + - [655, 7997.91] - - [4096, 200, 1, 256] - - [588, 5371.9] + - [660, 5371.9] - - [1024, 200, 1, 128] - - [566, 1998.15] + - [638, 1998.15] - - [256, 200, 1, 1024] - - [535, 1196.01] + - [607, 1196.01] - - [1024, 200, 1, 5120] - - [583, 4957.44] + - [655, 4957.44] - - [512, 1024, 1, 3072] - - [596, 7104.07] + - [668, 7104.07] - - [4096, 1024, 1, 15360] - - [480, 9669.04] + - [552, 9669.04] - - [1, 256, 1, 2048] - - [522, 13.9262] + - [594, 13.9262] - - [1024, 1024, 1, 4160] - - [574, 8759.3] + - [646, 8759.3] - - [1024, 256, 1, 256] - - [581, 3728.37] + - [653, 3728.37] - - [2048, 256, 1, 384] - - [583, 6123.17] + - [655, 6123.17] - - [512, 256, 1, 2560] - - [537, 3809.64] + - [609, 3809.64] - - [4096, 512, 1, 3072] - - [485, 9215.19] + - [557, 9215.19] - - [1024, 256, 1, 4160] - - [572, 6293.49] + - [644, 6293.49] - - [4096, 512, 1, 13312] - - [482, 9367.32] + - [554, 9367.32] - - [4096, 1024, 1, 3840] - - [480, 9631.57] + - [552, 9631.57] - - [4096, 200, 1, 640] - - [578, 6206.16] + - [650, 6206.16] - - [32, 200, 1, 2048] - - [516, 303.507] + - [588, 303.507] - - [1024, 200, 1, 512] - - [572, 3713.19] + - [644, 3713.19] - - [1024, 1024, 1, 7168] - - [575, 8475.74] + - [647, 8475.74] - - [2048, 1024, 1, 3200] - - [480, 9271.34] + - [552, 9271.34] - - [512, 512, 1, 1536] - - [583, 5832.27] + - [655, 5832.27] - - [4096, 256, 1, 768] - - [588, 8066.07] + - [660, 8066.07] - - [2048, 256, 1, 6656] - - [572, 8034.87] + - [644, 8034.87] - - [1024, 256, 1, 896] - - [572, 5467.54] + - [644, 5467.54] - - [2048, 256, 1, 512] - - [583, 6465.31] + - [655, 6465.31] - - [2048, 200, 1, 3072] - - [583, 6165.78] + - [655, 6165.78] - - [128, 200, 1, 1024] - - [540, 692.87] + - [612, 692.87] - - [4096, 512, 1, 3840] - - [485, 9272.7] + - [557, 9272.7] - - [1024, 200, 1, 3200] - - [583, 4838.85] + - [655, 4838.85] - - [4096, 512, 1, 5632] - - [480, 9335.52] + - [552, 9335.52] - - [4096, 512, 1, 64] - - [515, 5275.95] + - [587, 5275.95] - - [1024, 512, 1, 2816] - - [572, 7816.68] + - [644, 7816.68] - - [4096, 256, 1, 7680] - - [578, 8795.5] + - [650, 8795.5] - - [4096, 200, 1, 1024] - - [588, 6448.91] + - [660, 6448.91] - - [1024, 512, 1, 12288] - - [542, 7624.67] + - [614, 7624.67] - - [2048, 1024, 1, 512] - - [485, 8436.16] + - [557, 8436.16] - - [128, 256, 1, 2048] - - [559, 1342.28] + - [631, 1342.28] - - [2048, 200, 1, 1792] - - [583, 6020.47] + - [655, 6020.47] - - [1024, 1024, 1, 2816] - - [574, 8670.5] + - [646, 8670.5] - - [2048, 512, 1, 1536] - - [585, 8466.32] + - [657, 8466.32] - - [4096, 256, 1, 3072] - - [582, 8631.47] + - [654, 8631.47] - - [1024, 200, 1, 1536] - - [564, 4577.7] + - [636, 4577.7] - - [1024, 256, 1, 1024] - - [572, 5491.82] + - [644, 5491.82] - - [4096, 512, 1, 8192] - - [485, 9325.64] + - [557, 9325.64] - - [128, 1024, 1, 512] - - [583, 2534.42] + - [655, 2534.42] - - [4096, 512, 1, 2304] - - [480, 9193.09] + - [552, 9193.09] - - [2048, 256, 1, 5632] - - [583, 7999.64] + - [655, 7999.64] - - [1024, 256, 1, 5120] - - [583, 6307.32] + - [655, 6307.32] - - [1024, 512, 1, 6656] - - [583, 8028.95] + - [655, 8028.95] - - [4096, 512, 1, 2816] - - [480, 9234.5] + - [552, 9234.5] - - [4096, 200, 1, 2080] - - [567, 6697.96] + - [639, 6697.96] - - [1024, 200, 1, 2304] - - [583, 4752.91] + - [655, 4752.91] - - [2048, 200, 1, 13312] - - [572, 6346.23] + - [644, 6346.23] - - [64, 1024, 1, 1024] - - [556, 1359.68] + - [628, 1359.68] - - [4096, 256, 1, 3584] - - [578, 8668.9] + - [650, 8668.9] - - [2048, 1024, 1, 7680] - - [480, 9365.88] + - [552, 9365.88] - - [1024, 256, 1, 1664] - - [572, 5907.57] + - [644, 5907.57] - - [1, 512, 1, 2048] - - [499, 23.5057] + - [571, 23.5057] - - [512, 512, 1, 1024] - - [572, 5360.23] + - [644, 5360.23] - - [2048, 256, 1, 8192] - - [544, 7665.31] + - [616, 7665.31] - - [2048, 512, 1, 512] - - [574, 7767.33] + - [646, 7767.33] - - [4096, 512, 1, 1920] - - [480, 9133.04] + - [552, 9133.04] - - [4096, 200, 1, 12288] - - [588, 6910.75] + - [660, 6910.75] - - [1024, 512, 1, 3072] - - [518, 7310.43] + - [590, 7310.43] - - [2048, 512, 1, 1152] - - [578, 8342.36] + - [650, 8342.36] - - [1024, 256, 1, 2080] - - [572, 6010.46] + - [644, 6010.46] - - [4096, 1024, 1, 32] - - [568, 4793.59] + - [640, 4793.59] - - [4096, 512, 1, 16640] - - [480, 9365.41] + - [552, 9365.41] - - [2048, 200, 1, 9216] - - [572, 6315.98] + - [644, 6315.98] - - [2048, 200, 1, 2560] - - [572, 6119.24] + - [644, 6119.24] - - [2048, 1024, 1, 1024] - - [480, 8628.69] + - [552, 8628.69] - - [2048, 256, 1, 4608] - - [572, 7951.39] + - [644, 7951.39] - - [512, 200, 1, 768] - - [524, 2132.51] + - [596, 2132.51] - - [128, 256, 1, 512] - - [524, 670.117] + - [596, 670.117] - - [4096, 512, 1, 1792] - - [485, 9127.01] + - [557, 9127.01] - - [4096, 1024, 1, 8192] - - [480, 9591.37] + - [552, 9591.37] - - [1024, 256, 1, 2816] - - [583, 6119.11] + - [655, 6119.11] - - [1024, 1024, 1, 13312] - - [575, 8529.37] + - [647, 8529.37] - - [2048, 1024, 1, 4160] - - [480, 9305.67] + - [552, 9305.67] - - [2048, 256, 1, 3584] - - [572, 7903.23] + - [644, 7903.23] - - [128, 200, 1, 2048] - - [540, 1135.91] + - [612, 1135.91] - - [4096, 512, 1, 10240] - - [482, 9339.59] + - [554, 9339.59] - - [4096, 512, 1, 512] - - [480, 8446.78] + - [552, 8446.78] - - [2048, 1024, 1, 6656] - - [480, 9331.75] + - [552, 9331.75] - - [1024, 512, 1, 640] - - [572, 6776.04] + - [644, 6776.04] - - [2048, 512, 1, 768] - - [574, 8085.51] + - [646, 8085.51] - - [2048, 200, 1, 1408] - - [572, 5880.17] + - [644, 5880.17] - - [4096, 200, 1, 2048] - - [588, 6691.71] + - [660, 6691.71] - - [1024, 1024, 1, 5632] - - [574, 8749.63] + - [646, 8749.63] - - [2048, 512, 1, 3584] - - [578, 8704.23] + - [650, 8704.23] - - [64, 512, 1, 512] - - [514, 667.983] + - [586, 667.983] - - [64, 200, 1, 512] - - [524, 251.388] + - [596, 251.388] - - [1024, 200, 1, 64] - - [479, 1310.82] + - [551, 1310.82] - - [512, 512, 1, 2304] - - [572, 6078.8] + - [644, 6078.8] - - [2048, 1024, 1, 14336] - - [480, 9321.94] + - [552, 9321.94] - - [4096, 512, 1, 11264] - - [482, 9339.95] + - [554, 9339.95] - - [4096, 512, 1, 128] - - [567, 6566.53] + - [639, 6566.53] - - [1024, 512, 1, 64] - - [587, 2953.84] + - [659, 2953.84] - - [4096, 512, 1, 768] - - [480, 8738.23] + - [552, 8738.23] - - [4096, 1024, 1, 11264] - - [480, 9637.78] + - [552, 9637.78] - - [1, 256, 1, 1024] - - [570, 8.93234] + - [642, 8.93234] - - [4096, 200, 1, 7680] - - [567, 6889.57] + - [639, 6889.57] - - [1024, 200, 1, 12288] - - [539, 5237.74] + - [611, 5237.74] - - [1024, 1024, 1, 1280] - - [574, 8418.17] + - [646, 8418.17] - - [4096, 1024, 1, 16640] - - [480, 9675.01] + - [552, 9675.01] - - [2048, 1024, 1, 5632] - - [480, 9327.85] + - [552, 9327.85] - - [1024, 200, 1, 15360] - - [539, 5386.63] + - [611, 5386.63] - - [1, 1024, 1, 1024] - - [589, 27.3499] + - [661, 27.3499] - - [2048, 256, 1, 16384] - - [550, 7652.75] + - [622, 7652.75] - - [4096, 512, 1, 12288] - - [482, 9359.51] + - [554, 9359.51] - - [2048, 200, 1, 896] - - [583, 5628.96] + - [655, 5628.96] - - [4096, 1024, 1, 5632] - - [480, 9626.78] + - [552, 9626.78] - - [2048, 256, 1, 32] - - [576, 1889.43] + - [648, 1889.43] - - [2048, 256, 1, 1280] - - [572, 7390.94] + - [644, 7390.94] - - [4096, 256, 1, 4096] - - [574, 8694.37] + - [646, 8694.37] - - [2048, 256, 1, 11264] - - [572, 8113.95] + - [644, 8113.95] - - [4096, 200, 1, 9216] - - [574, 6891.08] + - [646, 6891.08] - - [1024, 512, 1, 4096] - - [520, 7348.46] + - [592, 7348.46] - - [2048, 1024, 1, 10240] - - [482, 9095.91] + - [554, 9095.91] - - [4096, 1024, 1, 640] - - [480, 9115.68] + - [552, 9115.68] - - [128, 1024, 1, 2048] - - [473, 3270.51] + - [545, 3270.51] - - [4096, 200, 1, 3840] - - [567, 6836.26] + - [639, 6836.26] - - [1024, 1024, 1, 1920] - - [578, 8562.82] + - [650, 8562.82] - - [2048, 200, 1, 7168] - - [583, 6296.23] + - [655, 6296.23] - - [2048, 512, 1, 16384] - - [474, 8632.51] + - [546, 8632.51] - - [2048, 1024, 1, 12288] - - [480, 9158.08] + - [552, 9158.08] - - [4096, 1024, 1, 10240] - - [480, 9658.84] + - [552, 9658.84] - - [1024, 1024, 1, 8320] - - [582, 8799.58] + - [654, 8799.58] - - [1024, 256, 1, 9216] - - [572, 6375.23] + - [644, 6375.23] - - [4096, 256, 1, 1152] - - [567, 8301.09] + - [639, 8301.09] - - [512, 200, 1, 2560] - - [533, 3088.51] + - [605, 3088.51] - - [2048, 256, 1, 1920] - - [572, 7714.94] + - [644, 7714.94] - - [2048, 1024, 1, 4608] - - [480, 9305.7] + - [552, 9305.7] - - [512, 256, 1, 1024] - - [580, 2887.74] + - [652, 2887.74] - - [1024, 256, 1, 1920] - - [564, 5913.12] + - [636, 5913.12] - - [4096, 512, 1, 3584] - - [480, 9275.69] + - [552, 9275.69] - - [2048, 512, 1, 4160] - - [585, 8734.03] + - [657, 8734.03] - - [2048, 512, 1, 5632] - - [588, 8758.98] + - [660, 8758.98] - - [4096, 1024, 1, 4608] - - [480, 9657.22] + - [552, 9657.22] - - [4096, 1024, 1, 3328] - - [480, 9621.45] + - [552, 9621.45] - - [4096, 256, 1, 7168] - - [574, 8770.05] + - [646, 8770.05] - - [4096, 200, 1, 128] - - [588, 4458.33] + - [660, 4458.33] - - [2048, 200, 1, 5120] - - [572, 6176.91] + - [644, 6176.91] - - [1024, 1024, 1, 6656] - - [574, 8780.45] + - [646, 8780.45] - - [512, 1024, 1, 3200] - - [583, 7887.09] + - [655, 7887.09] - - [512, 200, 1, 2304] - - [473, 2991.09] + - [545, 2991.09] - - [2048, 1024, 1, 9216] - - [485, 9325.46] + - [557, 9325.46] - - [2048, 256, 1, 1536] - - [583, 7551.73] + - [655, 7551.73] - - [4096, 256, 1, 256] - - [588, 6932.83] + - [660, 6932.83] - - [2048, 512, 1, 1408] - - [585, 8430.86] + - [657, 8430.86] - - [1024, 256, 1, 384] - - [577, 4462.13] + - [649, 4462.13] - - [2048, 1024, 1, 2304] - - [480, 9174.94] + - [552, 9174.94] - - [4096, 512, 1, 6144] - - [482, 9284.25] + - [554, 9284.25] - - [1024, 200, 1, 14336] - - [471, 5268.57] + - [543, 5268.57] - - [1024, 512, 1, 2080] - - [583, 7736.47] + - [655, 7736.47] - - [2048, 512, 1, 2304] - - [585, 8616.07] + - [657, 8616.07] - - [4096, 512, 1, 15360] - - [485, 9362.17] + - [557, 9362.17] - - [1024, 256, 1, 32] - - [505, 1028.12] + - [577, 1028.12] - - [1024, 200, 1, 2816] - - [583, 4780.58] + - [655, 4780.58] - - [4096, 200, 1, 512] - - [574, 6054.23] + - [646, 6054.23] - - [4096, 1024, 1, 7168] - - [485, 9468.49] + - [557, 9468.49] - - [2048, 256, 1, 14336] - - [546, 7865.52] + - [618, 7865.52] - - [1024, 200, 1, 3072] - - [583, 4804.2] + - [655, 4804.2] - - [2048, 200, 1, 1280] - - [583, 5846.31] + - [655, 5846.31] - - [1024, 1024, 1, 2304] - - [574, 8633.32] + - [646, 8633.32] - - [4096, 1024, 1, 9216] - - [480, 9641.03] + - [552, 9641.03] - - [2048, 512, 1, 4608] - - [585, 8743.3] + - [657, 8743.3] - - [4096, 1024, 1, 7680] - - [480, 9684.86] + - [552, 9684.86] - - [4096, 256, 1, 6144] - - [585, 8757.24] + - [657, 8757.24] - - [4096, 256, 1, 896] - - [578, 8258.93] + - [650, 8258.93] - - [512, 256, 1, 1536] - - [562, 3065.36] + - [634, 3065.36] - - [1024, 256, 1, 512] - - [572, 4752.85] + - [644, 4752.85] - - [2048, 256, 1, 640] - - [572, 6776.04] + - [644, 6776.04] - - [256, 256, 1, 2048] - - [509, 2249.06] + - [581, 2249.06] - - [2048, 1024, 1, 8192] - - [480, 9178.17] + - [552, 9178.17] - - [4096, 200, 1, 16640] - - [472, 7009.59] + - [544, 7009.59] - - [256, 512, 1, 512] - - [484, 2511.66] + - [556, 2511.66] - - [2048, 512, 1, 384] - - [585, 7467.7] + - [657, 7467.7] - - [2048, 200, 1, 16384] - - [553, 6327.31] + - [625, 6327.31] - - [4096, 200, 1, 10240] - - [578, 6892.74] + - [650, 6892.74] - - [1024, 512, 1, 9216] - - [527, 7530.09] + - [599, 7530.09] - - [4096, 1024, 1, 64] - - [502, 6260.26] + - [574, 6260.26] - - [4096, 200, 1, 1920] - - [588, 6710.27] + - [660, 6710.27] - - [2048, 1024, 1, 1280] - - [480, 8998.34] + - [552, 8998.34] - - [1024, 200, 1, 3840] - - [572, 4873.87] + - [644, 4873.87] - - [256, 1024, 1, 512] - - [583, 4766.35] + - [655, 4766.35] - - [2048, 1024, 1, 3328] - - [480, 9275.2] + - [552, 9275.2] - - [1024, 256, 1, 16640] - - [537, 6837.22] + - [609, 6837.22] - - [4096, 512, 1, 14336] - - [485, 9354.42] + - [557, 9354.42] - - [1024, 1024, 1, 16640] - - [582, 8832.37] + - [654, 8832.37] - - [1024, 256, 1, 1152] - - [583, 5642.66] + - [655, 5642.66] - - [512, 512, 1, 512] - - [572, 4779.93] + - [644, 4779.93] - - [4096, 512, 1, 8320] - - [485, 9327.96] + - [557, 9327.96] - - [2048, 512, 1, 7680] - - [588, 8793.96] + - [660, 8793.96] - - [4096, 1024, 1, 6656] - - [480, 9667.03] + - [552, 9667.03] - - [1024, 512, 1, 3584] - - [583, 7900.57] + - [655, 7900.57] - - [1024, 1024, 1, 32] - - [568, 2974.78] + - [640, 2974.78] - - [512, 512, 1, 2816] - - [564, 6155.85] + - [636, 6155.85] - - [2048, 512, 1, 1664] - - [588, 8496.55] + - [660, 8496.55] - - [1024, 1024, 1, 14336] - - [474, 8624.74] + - [546, 8624.74] - - [2048, 200, 1, 2048] - - [583, 6029.86] + - [655, 6029.86] - - [1024, 1024, 1, 3584] - - [574, 8702.62] + - [646, 8702.62] - - [512, 200, 1, 1280] - - [488, 2350.75] + - [560, 2350.75] - - [4096, 256, 1, 6656] - - [588, 8788.41] + - [660, 8788.41] - - [4096, 256, 1, 4160] - - [565, 8728.44] + - [637, 8728.44] - - [128, 256, 1, 1024] - - [547, 859.589] + - [619, 859.589] - - [512, 200, 1, 3200] - - [488, 3376.85] + - [560, 3376.85] - - [2048, 512, 1, 9216] - - [571, 8806.4] + - [643, 8806.4] - - [2048, 1024, 1, 256] - - [567, 7713.76] + - [639, 7713.76] - - [1024, 256, 1, 2304] - - [583, 6015.83] + - [655, 6015.83] - - [1024, 200, 1, 8192] - - [583, 5022.02] + - [655, 5022.02] - - [2048, 256, 1, 3072] - - [500, 7515.09] + - [572, 7515.09] - - [2048, 256, 1, 8320] - - [572, 8063.68] + - [644, 8063.68] - - [4096, 512, 1, 1024] - - [482, 8824.41] + - [554, 8824.41] - - [1024, 512, 1, 3200] - - [572, 7866.39] + - [644, 7866.39] - - [1024, 512, 1, 896] - - [564, 7161.11] + - [636, 7161.11] - - [2048, 512, 1, 1280] - - [578, 8384.52] + - [650, 8384.52] - - [4096, 200, 1, 64] - - [487, 3260.6] + - [559, 3260.6] - - [1024, 256, 1, 6144] - - [593, 6143.72] + - [665, 6143.72] - - [1024, 200, 1, 2560] - - [572, 4762.89] + - [644, 4762.89] - - [1024, 1024, 1, 5120] - - [501, 8454.23] + - [573, 8454.23] - - [2048, 512, 1, 6656] - - [578, 8799.05] + - [650, 8799.05] - - [4096, 1024, 1, 1536] - - [480, 9503.37] + - [552, 9503.37] - - [1024, 1024, 1, 128] - - [503, 5825.52] + - [575, 5825.52] - - [512, 1024, 1, 1792] - - [572, 7701.12] + - [644, 7701.12] - - [2048, 1024, 1, 32] - - [483, 3938.41] + - [555, 3938.41] - - [4096, 256, 1, 2816] - - [567, 8652.2] + - [639, 8652.2] - - [1024, 1024, 1, 15360] - - [474, 8719.7] + - [546, 8719.7] - - [1024, 256, 1, 5632] - - [572, 6344.18] + - [644, 6344.18] - - [1024, 1024, 1, 4096] - - [575, 8187.86] + - [647, 8187.86] - - [2048, 200, 1, 4160] - - [583, 6222.48] + - [655, 6222.48] - - [512, 256, 1, 768] - - [514, 2771.67] + - [586, 2771.67] - - [4096, 512, 1, 640] - - [485, 8590.58] + - [557, 8590.58] - - [2048, 512, 1, 8192] - - [527, 8494.9] + - [599, 8494.9] - - [1024, 512, 1, 768] - - [572, 7049.35] + - [644, 7049.35] - - [4096, 200, 1, 8320] - - [567, 6908.7] + - [639, 6908.7] - - [2048, 512, 1, 896] - - [574, 8224.23] + - [646, 8224.23] - - [4096, 200, 1, 7168] - - [585, 6878.59] + - [657, 6878.59] - - [2048, 512, 1, 13312] - - [573, 8803.04] + - [645, 8803.04] - - [64, 512, 1, 1024] - - [477, 844.024] + - [549, 844.024] - - [2048, 200, 1, 3840] - - [572, 6192.48] + - [644, 6192.48] - - [1024, 1024, 1, 768] - - [565, 8098.51] + - [637, 8098.51] - - [4096, 512, 1, 16384] - - [485, 9345.73] + - [557, 9345.73] - - [4096, 256, 1, 2304] - - [565, 8596.45] + - [637, 8596.45] - - [1, 256, 1, 4096] - - [570, 19.9293] + - [642, 19.9293] - - [1024, 1024, 1, 11264] - - [575, 8491.48] + - [647, 8491.48] - - [2048, 200, 1, 16640] - - [569, 6510.64] + - [641, 6510.64] - - [1024, 256, 1, 3072] - - [583, 6179.55] + - [655, 6179.55] - - [4096, 1024, 1, 512] - - [480, 9032.25] + - [552, 9032.25] - - [2048, 256, 1, 2816] - - [572, 7793.57] + - [644, 7793.57] - - [32, 512, 1, 512] - - [484, 318.816] + - [556, 318.816] - - [256, 512, 1, 2048] - - [535, 3369.02] + - [607, 3369.02] - - [1024, 512, 1, 384] - - [583, 6198.58] + - [655, 6198.58] - - [2048, 200, 1, 7680] - - [572, 6307.7] + - [644, 6307.7] - - [1024, 512, 1, 4608] - - [583, 7953.48] - - - [2048, 256, 1, 768] - - [583, 7059.24] + - [655, 7953.48] - - [4096, 200, 1, 32] - - [532, 2199.29] + - [604, 2199.29] - - [4096, 200, 1, 3328] - - [567, 6813.12] + - [639, 6813.12] - - [1024, 200, 1, 1152] - - [572, 4375.65] + - [644, 4375.65] - - [1024, 1024, 1, 1408] - - [574, 8457.91] + - [646, 8457.91] - - [2048, 200, 1, 15360] - - [548, 6333.1] + - [620, 6333.1] - - [512, 1024, 1, 2048] - - [558, 6280.76] + - [630, 6280.76] - - [1024, 512, 1, 1024] - - [583, 7064.19] + - [655, 7064.19] - - [1024, 200, 1, 10240] - - [572, 5030.69] + - [644, 5030.69] - - [4096, 256, 1, 5632] - - [585, 8765.22] + - [657, 8765.22] - - [512, 512, 1, 3072] - - [595, 5942.44] + - [667, 5942.44] - - [2048, 256, 1, 1408] - - [572, 7545.05] + - [644, 7545.05] - - [2048, 256, 1, 6144] - - [583, 7963.97] + - [655, 7963.97] - - [4096, 256, 1, 3328] - - [578, 8682.58] + - [650, 8682.58] - - [1024, 200, 1, 1664] - - [572, 4595.4] + - [644, 4595.4] - - [2048, 1024, 1, 1152] - - [480, 8942.65] + - [552, 8942.65] - - [2048, 512, 1, 6144] - - [573, 8729.71] + - [645, 8729.71] - - [2048, 512, 1, 3200] - - [574, 8696.56] + - [646, 8696.56] - - [4096, 1024, 1, 2080] - - [513, 9538.45] + - [585, 9538.45] - - [4096, 1024, 1, 768] - - [480, 9260.75] + - [552, 9260.75] - - [4096, 1024, 1, 2560] - - [480, 9567.27] + - [552, 9567.27] - - [64, 200, 1, 2048] - - [512, 583.161] + - [584, 583.161] - - [2048, 200, 1, 4608] - - [583, 6243.28] + - [655, 6243.28] - - [1024, 1024, 1, 6144] - - [575, 8320.25] + - [647, 8320.25] - - [4096, 256, 1, 1664] - - [578, 8503.17] + - [650, 8503.17] - - [2048, 200, 1, 384] - - [583, 4940.0] + - [655, 4940.0] - - [1, 200, 1, 2048] - - [529, 11.3281] + - [601, 11.3281] - - [4096, 256, 1, 1792] - - [588, 8504.12] + - [660, 8504.12] - - [2048, 1024, 1, 64] - - [502, 5309.35] + - [574, 5309.35] - - [4096, 1024, 1, 16384] - - [469, 9428.61] + - [541, 9428.61] - - [1024, 512, 1, 16640] - - [583, 8122.55] + - [655, 8122.55] - - [2048, 512, 1, 10240] - - [573, 8766.21] + - [645, 8766.21] - - [4096, 512, 1, 6656] - - [480, 9351.75] + - [552, 9351.75] - - [2048, 256, 1, 16640] - - [572, 8135.27] + - [644, 8135.27] - - [2048, 512, 1, 2816] - - [574, 8660.32] + - [646, 8660.32] - - [1024, 200, 1, 32] - - [492, 780.291] + - [564, 780.291] - - [1, 512, 1, 4096] - - [517, 34.8671] + - [589, 34.8671] - - [256, 256, 1, 1024] - - [524, 1490.08] + - [596, 1490.08] - - [2048, 1024, 1, 128] - - [497, 6605.3] + - [569, 6605.3] - - [2048, 1024, 1, 2080] - - [480, 9159.51] + - [552, 9159.51] - - [2048, 1024, 1, 16640] - - [480, 9371.65] + - [552, 9371.65] - - [1024, 200, 1, 384] - - [583, 3378.24] + - [655, 3378.24] - - [4096, 256, 1, 384] - - [528, 7369.3] + - [600, 7369.3] - - [4096, 256, 1, 13312] - - [582, 8776.48] + - [654, 8776.48] - - [2048, 256, 1, 128] - - [577, 4280.0] + - [649, 4280.0] - - [512, 256, 1, 2304] - - [489, 3584.98] + - [561, 3584.98] - - [2048, 1024, 1, 3072] - - [482, 9156.52] + - [554, 9156.52] - - [1024, 1024, 1, 640] - - [578, 7928.84] + - [650, 7928.84] - - [256, 512, 1, 1024] - - [583, 2843.7] + - [655, 2843.7] - - [4096, 1024, 1, 1408] - - [480, 9437.56] + - [552, 9437.56] - - [4096, 200, 1, 5632] - - [585, 6873.96] + - [657, 6873.96] - - [4096, 1024, 1, 2048] - - [480, 9437.1] + - [552, 9437.1] - - [2048, 1024, 1, 2560] - - [485, 9195.62] + - [557, 9195.62] - - [4096, 1024, 1, 128] - - [567, 7407.26] + - [639, 7407.26] - - [1024, 200, 1, 3328] - - [583, 4857.39] + - [655, 4857.39] - - [2048, 200, 1, 1152] - - [572, 5760.1] + - [644, 5760.1] - - [1024, 200, 1, 9216] - - [471, 5053.21] + - [543, 5053.21] - - [4096, 256, 1, 512] - - [565, 7617.45] + - [637, 7617.45] - - [4096, 1024, 1, 14336] - - [480, 9665.12] + - [552, 9665.12] - - [1024, 1024, 1, 384] - - [503, 7478.8] + - [575, 7478.8] - - [2048, 200, 1, 512] - - [572, 5150.28] + - [644, 5150.28] - - [2048, 256, 1, 9216] - - [551, 7717.71] + - [623, 7717.71] - - [2048, 256, 1, 1792] - - [572, 7655.94] + - [644, 7655.94] - - [4096, 512, 1, 9216] - - [482, 9331.22] + - [554, 9331.22] - - [4096, 200, 1, 15360] - - [472, 6958.14] + - [544, 6958.14] - - [1024, 512, 1, 2048] - - [571, 7067.91] + - [643, 7067.91] - - [64, 256, 1, 2048] - - [496, 723.256] + - [568, 723.256] - - [4096, 200, 1, 1792] - - [574, 6699.65] + - [646, 6699.65] - - [1, 200, 1, 4096] - - [506, 15.6387] + - [578, 15.6387] - - [2048, 1024, 1, 2048] - - [485, 9071.93] + - [557, 9071.93] - - [1024, 200, 1, 2080] - - [564, 4679.19] + - [636, 4679.19] - - [2048, 200, 1, 1536] - - [583, 5939.92] + - [655, 5939.92] - - [1024, 1024, 1, 3072] - - [545, 8333.15] + - [617, 8333.15] - - [512, 200, 1, 1792] - - [470, 2679.73] + - [542, 2679.73] - - [1024, 256, 1, 11264] - - [473, 6470.98] + - [545, 6470.98] - - [2048, 512, 1, 12288] - - [520, 8729.24] + - [592, 8729.24] - - [1024, 256, 1, 1792] - - [583, 5931.44] + - [655, 5931.44] - - [1024, 200, 1, 7168] - - [583, 4970.33] + - [655, 4970.33] - - [32, 256, 1, 1024] - - [494, 237.334] + - [566, 237.334] - - [512, 256, 1, 3072] - - [537, 3813.1] + - [609, 3813.1] - - [1024, 1024, 1, 2080] - - [574, 8600.41] + - [646, 8600.41] - - [2048, 200, 1, 2304] - - [583, 6093.32] + - [655, 6093.32] - - [4096, 512, 1, 1536] - - [480, 9075.0] + - [552, 9075.0] - - [2048, 256, 1, 7168] - - [583, 7895.26] + - [655, 7895.26] - - [2048, 512, 1, 1792] - - [585, 8531.92] + - [657, 8531.92] - - [1024, 200, 1, 2048] - - [572, 4685.43] + - [644, 4685.43] - - [1024, 1024, 1, 4608] - - [578, 8735.71] + - [650, 8735.71] - - [4096, 256, 1, 8192] - - [574, 8782.55] + - [646, 8782.55] - - [512, 1024, 1, 1280] - - [564, 7483.25] + - [636, 7483.25] - - [2048, 1024, 1, 16384] - - [474, 8878.96] + - [546, 8878.96] - - [512, 512, 1, 1280] - - [572, 5745.72] + - [644, 5745.72] - - [1024, 200, 1, 1280] - - [564, 4446.23] + - [636, 4446.23] - - [4096, 512, 1, 4096] - - [482, 9264.49] + - [554, 9264.49] - - [2048, 256, 1, 3200] - - [572, 7842.85] + - [644, 7842.85] - - [2048, 512, 1, 15360] - - [520, 8757.24] + - [592, 8757.24] - - [1024, 512, 1, 3328] - - [572, 7854.04] + - [644, 7854.04] - - [1024, 512, 1, 4160] - - [572, 7934.61] + - [644, 7934.61] - - [4096, 200, 1, 6656] - - [574, 6883.3] + - [646, 6883.3] - - [4096, 1024, 1, 1024] - - [480, 9229.44] + - [552, 9229.44] - - [2048, 200, 1, 3328] - - [583, 6182.74] + - [655, 6182.74] - - [1024, 1024, 1, 256] - - [503, 6932.83] + - [575, 6932.83] - - [512, 200, 1, 512] - - [524, 1910.77] + - [596, 1910.77] - - [2048, 256, 1, 64] - - [495, 2912.81] + - [567, 2912.81] - - [1024, 256, 1, 2560] - - [572, 6123.17] + - [644, 6123.17] - - [2048, 512, 1, 11264] - - [584, 8728.94] + - [656, 8728.94] - - [32, 200, 1, 1024] - - [579, 187.56] + - [651, 187.56] - - [32, 512, 1, 2048] - - [523, 694.521] + - [595, 694.521] - - [2048, 256, 1, 2304] - - [572, 7759.35] + - [644, 7759.35] - - [2048, 256, 1, 12288] - - [551, 7726.35] + - [623, 7726.35] - - [4096, 200, 1, 8192] - - [574, 6870.94] + - [646, 6870.94] - - [1024, 512, 1, 7168] - - [520, 7479.2] + - [592, 7479.2] - - [1024, 512, 1, 1792] - - [572, 7626.11] + - [644, 7626.11] - - [4096, 1024, 1, 1664] - - [480, 9503.54] + - [552, 9503.54] - - [4096, 200, 1, 2816] - - [567, 6775.44] + - [639, 6775.44] - - [1024, 1024, 1, 896] - - [574, 8229.99] + - [646, 8229.99] - - [1024, 200, 1, 8320] - - [535, 5173.58] + - [607, 5173.58] - - [1024, 1024, 1, 12288] - - [575, 8463.21] + - [647, 8463.21] - - [1024, 256, 1, 8320] - - [564, 6404.37] + - [636, 6404.37] - - [1024, 200, 1, 1024] - - [572, 4297.54] + - [644, 4297.54] - - [1024, 200, 1, 16640] - - [534, 5499.51] + - [606, 5499.51] - - [4096, 256, 1, 5120] - - [588, 8729.15] + - [660, 8729.15] - - [1024, 256, 1, 3200] - - [583, 6124.96] + - [655, 6124.96] - - [512, 512, 1, 2560] - - [583, 6109.79] + - [655, 6109.79] - - [4096, 256, 1, 2048] - - [588, 8511.05] + - [660, 8511.05] - - [1024, 256, 1, 640] - - [572, 5102.66] + - [644, 5102.66] - - [2048, 256, 1, 5120] - - [500, 7667.93] + - [572, 7667.93] - - [2048, 256, 1, 7680] - - [583, 8054.45] + - [655, 8054.45] - - [4096, 512, 1, 384] - - [578, 8190.77] + - [650, 8190.77] - - [2048, 200, 1, 3584] - - [572, 6166.12] + - [644, 6166.12] - - [1024, 512, 1, 1536] - - [572, 7517.9] + - [644, 7517.9] - - [4096, 512, 1, 3328] - - [480, 9259.45] + - [552, 9259.45] - - [4096, 1024, 1, 256] - - [480, 8341.79] + - [552, 8341.79] - - [2048, 200, 1, 64] - - [543, 2307.71] + - [615, 2307.71] - - [2048, 200, 1, 4096] - - [583, 6212.04] + - [655, 6212.04] - - [1024, 1024, 1, 1536] - - [574, 8484.15] + - [646, 8484.15] - - [2048, 1024, 1, 7168] - - [482, 9315.24] + - [554, 9315.24] - - [1024, 256, 1, 3584] - - [572, 6207.32] + - [644, 6207.32] - - [4096, 256, 1, 32] - - [576, 2892.72] + - [648, 2892.72] - - [4096, 256, 1, 1280] - - [585, 8392.9] + - [657, 8392.9] - - [512, 512, 1, 3200] - - [583, 6219.41] + - [655, 6219.41] - - [2048, 1024, 1, 1536] - - [482, 9052.55] + - [554, 9052.55] - - [2048, 256, 1, 1024] - - [572, 7192.9] + - [644, 7192.9] - - [128, 200, 1, 512] - - [562, 502.677] + - [634, 502.677] - - [4096, 512, 1, 7168] - - [485, 9329.11] + - [557, 9329.11] - - [1024, 512, 1, 1152] - - [572, 7358.53] + - [644, 7358.53] - - [64, 1024, 1, 2048] - - [490, 2102.51] + - [562, 2102.51] - - [2048, 512, 1, 3328] - - [574, 8694.69] + - [646, 8694.69] - - [4096, 1024, 1, 896] - - [480, 9343.02] + - [552, 9343.02] - - [1, 1024, 1, 2048] - - [530, 40.9324] + - [602, 40.9324] - - [4096, 200, 1, 3584] - - [578, 6810.3] + - [650, 6810.3] - - [4096, 1024, 1, 4096] - - [480, 9347.56] + - [552, 9347.56] - - [1024, 256, 1, 14336] - - [473, 6625.8] + - [545, 6625.8] - - [2048, 200, 1, 256] - - [572, 4413.3] + - [644, 4413.3] - - [4096, 256, 1, 16384] - - [474, 8752.13] + - [546, 8752.13] - - [4096, 256, 1, 1920] - - [565, 8533.78] + - [637, 8533.78] - - [32, 1024, 1, 512] - - [563, 647.369] + - [635, 647.369] - - [1024, 256, 1, 7680] - - [583, 6387.36] + - [655, 6387.36] - - [2048, 256, 1, 1664] - - [583, 7631.44] + - [655, 7631.44] - - [512, 200, 1, 1536] - - [488, 2576.88] + - [560, 2576.88] - - [2048, 1024, 1, 6144] - - [469, 9033.77] + - [541, 9033.77] - - [512, 256, 1, 2816] - - [535, 3977.46] + - [607, 3977.46] - - [4096, 512, 1, 4160] - - [482, 9289.02] + - [554, 9289.02] - - [4096, 512, 1, 2080] - - [561, 9150.28] + - [633, 9150.28] - - [2048, 256, 1, 15360] - - [546, 7963.97] + - [618, 7963.97] - - [4096, 200, 1, 5120] - - [585, 6861.62] + - [657, 6861.62] - - [1024, 512, 1, 8192] - - [571, 7473.25] + - [643, 7473.25] - - [4096, 200, 1, 896] - - [588, 6443.25] + - [660, 6443.25] - - [2048, 512, 1, 8320] - - [578, 8810.24] + - [650, 8810.24] - - [1024, 1024, 1, 10240] - - [586, 8436.7] + - [658, 8436.7] - - [1024, 200, 1, 768] - - [572, 4087.58] + - [644, 4087.58] - - [2048, 200, 1, 640] - - [583, 5416.3] + - [655, 5416.3] - - [512, 200, 1, 2048] - - [537, 2702.62] + - [609, 2702.62] - - [1024, 1024, 1, 9216] - - [575, 8499.08] + - [647, 8499.08] - - [4096, 200, 1, 1408] - - [585, 6613.82] + - [657, 6613.82] - - [1024, 256, 1, 13312] - - [473, 6643.54] + - [545, 6643.54] - - [1024, 256, 1, 128] - - [504, 2706.1] + - [576, 2706.1] - - [2048, 200, 1, 5632] - - [583, 6270.12] + - [655, 6270.12] - - [64, 1024, 1, 512] - - [562, 1310.82] + - [634, 1310.82] - - [1024, 512, 1, 2560] - - [583, 7731.54] + - [655, 7731.54] - - [4096, 200, 1, 1280] - - [565, 6566.83] + - [637, 6566.83] - - [1024, 200, 1, 4096] - - [583, 4911.46] + - [655, 4911.46] - - [1024, 1024, 1, 2560] - - [574, 8630.35] + - [646, 8630.35] - - [2048, 512, 1, 64] - - [578, 4152.88] + - [650, 4152.88] - - [2048, 200, 1, 8192] - - [572, 6234.21] + - [644, 6234.21] - - [2048, 512, 1, 3072] - - [582, 8614.85] + - [654, 8614.85] - - [4096, 1024, 1, 5120] - - [480, 9573.75] + - [552, 9573.75] - - [4096, 256, 1, 640] - - [567, 7913.88] + - [639, 7913.88] - - [1024, 256, 1, 1280] - - [572, 5706.64] + - [644, 5706.64] - - [2048, 1024, 1, 1920] - - [482, 9141.34] + - [554, 9141.34] - - [2048, 256, 1, 4096] - - [572, 7937.28] + - [644, 7937.28] - - [2048, 1024, 1, 15360] - - [485, 9351.96] + - [557, 9351.96] - - [4096, 200, 1, 16384] - - [474, 6975.21] + - [546, 6975.21] - - [1, 1024, 1, 4096] - - [592, 60.7815] + - [664, 60.7815] - - [4096, 1024, 1, 2816] - - [480, 9583.98] + - [552, 9583.98] - - [4096, 200, 1, 1664] - - [567, 6658.7] + - [639, 6658.7] - - [4096, 512, 1, 256] - - [498, 7731.54] + - [570, 7731.54] - - [1024, 200, 1, 896] - - [572, 4193.45] + - [644, 4193.45] - - [2048, 200, 1, 6656] - - [583, 6291.17] + - [655, 6291.17] - - [2048, 1024, 1, 5120] - - [482, 9270.57] + - [554, 9270.57] - - [512, 1024, 1, 768] - - [572, 7099.06] + - [644, 7099.06] - - [2048, 512, 1, 14336] - - [552, 8559.13] + - [624, 8559.13] - - [2048, 200, 1, 8320] - - [572, 6314.72] + - [644, 6314.72] - - [4096, 256, 1, 3840] - - [588, 8718.56] + - [660, 8718.56] - - [2048, 1024, 1, 4096] - - [469, 8973.38] + - [541, 8973.38] - - [1024, 1024, 1, 3200] - - [578, 8701.98] + - [650, 8701.98] - - [1024, 256, 1, 4608] - - [572, 6268.05] + - [644, 6268.05] - - [4096, 512, 1, 4608] - - [480, 9316.47] + - [552, 9316.47] - - [2048, 512, 1, 2048] - - [571, 8462.76] + - [643, 8462.76] - - [4096, 512, 1, 1664] - - [480, 9074.53] + - [552, 9074.53] - - [4096, 256, 1, 4608] - - [567, 8718.05] + - [639, 8718.05] - - [1024, 512, 1, 32] - - [560, 1807.99] + - [632, 1807.99] - - [1024, 512, 1, 3840] - - [572, 7936.34] + - [644, 7936.34] - - [2048, 512, 1, 1920] - - [588, 8548.27] + - [660, 8548.27] - - [2048, 1024, 1, 896] - - [480, 8843.51] + - [552, 8843.51] - - [4096, 200, 1, 6144] - - [588, 6864.76] + - [660, 6864.76] - - [1024, 512, 1, 13312] - - [541, 7763.19] + - [613, 7763.19] - - [4096, 1024, 1, 4160] - - [480, 9650.72] + - [552, 9650.72] - - [2048, 200, 1, 2816] - - [572, 6119.76] + - [644, 6119.76] - - [1024, 1024, 1, 3840] - - [567, 8709.5] + - [639, 8709.5] - - [128, 1024, 1, 1024] - - [590, 2577.25] + - [662, 2577.25] - - [2048, 1024, 1, 11264] - - [485, 9339.06] + - [557, 9339.06] - - [2048, 1024, 1, 384] - - [574, 8210.81] + - [646, 8210.81] - - [1024, 256, 1, 2048] - - [595, 5755.58] + - [667, 5755.58] - - [2048, 1024, 1, 3840] - - [482, 9288.96] + - [554, 9288.96] - - [4096, 256, 1, 8320] - - [588, 8812.38] + - [660, 8812.38] - - [2048, 256, 1, 3840] - - [564, 7857.05] + - [636, 7857.05] - - [64, 256, 1, 512] - - [562, 336.182] + - [634, 336.182] - - [4096, 512, 1, 1280] - - [482, 8993.52] + - [554, 8993.52] - - [512, 256, 1, 1280] - - [514, 2996.03] + - [586, 2996.03] - - [1024, 512, 1, 7680] - - [572, 8041.59] + - [644, 8041.59] - - [4096, 1024, 1, 1152] - - [480, 9368.48] + - [552, 9368.48] - - [256, 200, 1, 512] - - [514, 993.07] + - [586, 993.07] - - [256, 1024, 1, 2048] - - [591, 4759.59] + - [663, 4759.59] - - [2048, 200, 1, 10240] - - [583, 6329.03] + - [655, 6329.03] - - [2048, 512, 1, 5120] - - [584, 8732.56] + - [656, 8732.56] - - [2048, 1024, 1, 1408] - - [482, 9006.9] + - [554, 9006.9] - - [512, 1024, 1, 512] - - [572, 6528.2] + - [644, 6528.2] - - [1024, 200, 1, 11264] - - [539, 5194.82] + - [611, 5194.82] - - [512, 1024, 1, 1024] - - [525, 6337.1] + - [597, 6337.1] - - [2048, 512, 1, 32] - - [491, 2777.78] + - [563, 2777.78] - - [4096, 256, 1, 2560] - - [574, 8621.49] + - [646, 8621.49] - - [4096, 256, 1, 64] - - [508, 4194.4] + - [580, 4194.4] - - [32, 1024, 1, 1024] - - [509, 778.264] + - [581, 778.264] - - [2048, 200, 1, 768] - - [583, 5507.33] + - [655, 5507.33] - - [512, 512, 1, 2048] - - [531, 5338.91] + - [603, 5338.91] - - [2048, 512, 1, 2560] - - [585, 8643.69] + - [657, 8643.69] - - [512, 256, 1, 512] - - [564, 2542.1] + - [636, 2542.1] - - [1024, 200, 1, 7680] - - [539, 5047.8] + - [611, 5047.8] - - [4096, 512, 1, 896] - - [480, 8856.85] + - [552, 8856.85] - - [4096, 1024, 1, 3072] - - [480, 9492.17] + - [552, 9492.17] - - [4096, 200, 1, 13312] - - [472, 6900.73] + - [544, 6900.73] - - [2048, 512, 1, 7168] - - [573, 8788.1] + - [645, 8788.1] - - [2048, 1024, 1, 2816] - - [485, 9229.88] + - [557, 9229.88] - - [2048, 512, 1, 128] - - [503, 5630.04] + - [575, 5630.04] - - [1024, 256, 1, 8192] - - [595, 6203.83] + - [667, 6203.83] - - [4096, 1024, 1, 1792] - - [480, 9510.42] + - [552, 9510.42] - - [1024, 200, 1, 6656] - - [564, 5002.85] + - [636, 5002.85] - - [1024, 1024, 1, 1024] - - [501, 8095.26] + - [573, 8095.26] - - [4096, 200, 1, 2304] - - [585, 6754.45] + - [657, 6754.45] - - [4096, 512, 1, 1152] - - [480, 8974.54] + - [552, 8974.54] - - [512, 200, 1, 1024] - - [562, 2233.01] + - [634, 2233.01] - - [1024, 256, 1, 3840] - - [583, 6244.72] + - [655, 6244.72] - - [512, 512, 1, 768] - - [572, 5331.84] + - [644, 5331.84] - - [2048, 512, 1, 4096] - - [582, 8621.76] + - [654, 8621.76] - - [2048, 256, 1, 2560] - - [572, 7770.93] + - [644, 7770.93] - - [2048, 256, 1, 4160] - - [583, 7923.08] + - [655, 7923.08] - - [1024, 256, 1, 64] - - [479, 1705.1] + - [551, 1705.1] - - [4096, 512, 1, 7680] - - [480, 9364.57] + - [552, 9364.57] - - [1024, 512, 1, 1664] - - [583, 7594.24] + - [655, 7594.24] - - [2048, 512, 1, 2080] - - [574, 8570.67] + - [646, 8570.67] - - [2048, 512, 1, 3840] - - [585, 8729.14] + - [657, 8729.14] - - [4096, 1024, 1, 384] - - [480, 8764.86] + - [552, 8764.86] - - [4096, 200, 1, 3072] - - [574, 6772.39] + - [646, 6772.39] - - [1024, 512, 1, 14336] - - [542, 7680.97] + - [614, 7680.97] - - [1024, 200, 1, 1920] - - [564, 4637.08] + - [636, 4637.08] - - [1024, 1024, 1, 1664] - - [578, 8506.49] + - [650, 8506.49] - - [512, 1024, 1, 2304] - - [572, 7775.33] + - [644, 7775.33] - - [2048, 1024, 1, 1792] - - [480, 9123.46] + - [552, 9123.46] - - [32, 200, 1, 512] - - [580, 125.744] + - [652, 125.744] - - [4096, 256, 1, 11264] - - [585, 8822.31] + - [657, 8822.31] - - [4096, 256, 1, 1408] - - [585, 8419.32] + - [657, 8419.32] - - [1024, 256, 1, 7168] - - [572, 6377.54] + - [644, 6377.54] - - [2048, 256, 1, 1152] - - [583, 7401.81] + - [655, 7401.81] - - [256, 256, 1, 512] - - [562, 1314.93] + - [634, 1314.93] - - [1024, 512, 1, 1280] - - [572, 7410.53] + - [644, 7410.53] - - [512, 512, 1, 1792] - - [564, 5931.44] + - [636, 5931.44] - - [2048, 200, 1, 12288] - - [546, 6242.25] + - [618, 6242.25] - - [2048, 200, 1, 1664] - - [583, 5953.75] + - [655, 5953.75] - - [4096, 200, 1, 4608] - - [578, 6853.54] + - [650, 6853.54] - - [512, 1024, 1, 2560] - - [572, 7778.13] + - [644, 7778.13] - - [4096, 200, 1, 384] - - [565, 5765.73] + - [637, 5765.73] - - [128, 512, 1, 512] - - [562, 1302.68] + - [634, 1302.68] - - [1024, 200, 1, 256] - - [566, 2861.93] + - [638, 2861.93] - - [256, 1024, 1, 1024] - - [507, 4522.26] + - [579, 4522.26] - - [2048, 200, 1, 128] - - [572, 3310.0] + - [644, 3310.0] - - [2048, 200, 1, 11264] - - [553, 6168.2] + - [625, 6168.2] - - [1024, 512, 1, 1920] - - [583, 7649.29] + - [655, 7649.29] - - [4096, 256, 1, 1536] - - [578, 8427.33] + - [650, 8427.33] - - [4096, 1024, 1, 3584] - - [480, 9618.0] + - [552, 9618.0] - - [2048, 256, 1, 256] - - [572, 5464.99] + - [644, 5464.99] - - [2048, 1024, 1, 768] - - [480, 8726.87] + - [552, 8726.87] - - [4096, 256, 1, 10240] - - [574, 8790.89] + - [646, 8790.89] - - [2048, 256, 1, 10240] - - [554, 7665.31] + - [626, 7665.31] - - [4096, 200, 1, 14336] - - [588, 6916.18] + - [660, 6916.18] - - [1024, 512, 1, 5120] - - [526, 7420.36] + - [598, 7420.36] - - [1024, 512, 1, 8320] - - [583, 8061.31] + - [655, 8061.31] - - [256, 200, 1, 2048] - - [538, 1916.36] + - [610, 1916.36] - - [1024, 200, 1, 640] - - [566, 3873.39] + - [638, 3873.39] - - [1024, 512, 1, 10240] - - [571, 7526.9] + - [643, 7526.9] - - [1024, 200, 1, 4160] - - [583, 4928.19] + - [655, 4928.19] - - [1024, 200, 1, 5632] - - [564, 4978.66] + - [636, 4978.66] - - [1024, 1024, 1, 2048] - - [519, 7937.28] + - [591, 7937.28] - - [1024, 256, 1, 6656] - - [583, 6373.68] + - [655, 6373.68] - - [2048, 1024, 1, 8320] - - [480, 9333.15] + - [552, 9333.15] - - [1024, 256, 1, 10240] - - [572, 6407.29] + - [644, 6407.29] - - [2048, 256, 1, 2080] - - [572, 7714.58] + - [644, 7714.58] - - [4096, 256, 1, 128] - - [486, 5765.47] + - [558, 5765.47] - - [1024, 256, 1, 768] - - [577, 5210.42] + - [649, 5210.42] - - [2048, 256, 1, 896] - - [583, 7267.46] + - [655, 7267.46] - - [64, 512, 1, 2048] - - [549, 1296.64] + - [621, 1296.64] - - [4096, 512, 1, 2048] - - [482, 9121.25] + - [554, 9121.25] - - [512, 256, 1, 2048] - - [535, 3283.31] + - [607, 3283.31] - - [4096, 256, 1, 16640] - - [567, 8839.88] + - [639, 8839.88] - - [4096, 512, 1, 2560] - - [485, 9222.15] + - [557, 9222.15] - - [1024, 512, 1, 15360] - - [536, 7865.66] + - [608, 7865.66] - - [4096, 1024, 1, 2304] - - [480, 9558.26] + - [552, 9558.26] - - [4096, 200, 1, 1152] - - [585, 6531.93] + - [657, 6531.93] - - [2048, 200, 1, 6144] - - [583, 6277.75] + - [655, 6277.75] - - [1024, 1024, 1, 7680] - - [578, 8799.34] + - [650, 8799.34] - - [2048, 200, 1, 1920] - - [583, 6031.02] + - [655, 6031.02] - - [32, 1024, 1, 2048] - - [557, 1174.98] + - [629, 1174.98] - - [1024, 200, 1, 3584] - - [564, 4880.44] + - [636, 4880.44] - - [4096, 256, 1, 2080] - - [571, 8557.22] + - [643, 8557.22] - - [1024, 1024, 1, 16384] - - [472, 8618.65] + - [544, 8618.65] - - [1024, 256, 1, 1408] - - [583, 5803.54] + - [655, 5803.54] - - [1024, 256, 1, 4096] - - [593, 6037.78] + - [665, 6037.78] - - [2048, 200, 1, 14336] - - [583, 6364.48] + - [655, 6364.48] - - [4096, 512, 1, 5120] - - [482, 9302.05] + - [554, 9302.05] - - [1024, 512, 1, 6144] - - [518, 7469.09] + - [590, 7469.09] - - [1024, 512, 1, 2304] - - [583, 7759.35] + - [655, 7759.35] - - [4096, 200, 1, 4160] - - [567, 6843.22] + - [639, 6843.22] - - [4096, 200, 1, 1536] - - [578, 6628.27] + - [650, 6628.27] - - [4096, 1024, 1, 6144] - - [480, 9593.08] + - [552, 9593.08] - - [1280, 384, 1, 64] - - [611, 3196.98] + - [683, 3196.98] - - [256, 64, 1, 1225] - - [612, 1194.77] + - [684, 1194.77] - - [2048, 320, 1, 64] - - [614, 3449.36] + - [686, 3449.36] - - [256, 48, 1, 1225] - - [605, 913.498] + - [677, 913.498] - - [2048, 192, 1, 64] - - [604, 2516.68] + - [676, 2516.68] - - [1024, 128, 1, 289] - - [618, 2869.78] + - [690, 2869.78] - - [1280, 192, 1, 64] - - [597, 1872.56] + - [669, 1872.56] - - [192, 32, 1, 1225] - - [602, 505.906] + - [674, 505.906] - - [1280, 448, 1, 64] - - [598, 3078.97] + - [670, 3078.97] - - [384, 64, 1, 1225] - - [603, 1511.43] + - [675, 1511.43] - - [2048, 384, 1, 64] - - [616, 3836.35] + - [688, 3836.35] - - [288, 48, 1, 1225] - - [599, 1032.69] + - [671, 1032.69] - - [64, 80, 1, 5329] - - [615, 888.267] + - [687, 888.267] - - [1024, 384, 1, 289] - - [609, 4291.62] + - [681, 4291.62] - - [2048, 448, 1, 64] - - [608, 3783.62] + - [680, 3783.62] - - [1280, 320, 1, 64] - - [614, 2777.05] + - [686, 2777.05] - - [192, 64, 1, 1225] - - [599, 926.997] + - [671, 926.997] - - [384, 192, 1, 1225] - - [610, 2560.1] + - [682, 2560.1] - - [1536, 256, 1, 64] - - [617, 2621.54] + - [689, 2621.54] - - [192, 48, 1, 1225] - - [602, 698.714] + - [674, 698.714] - - [768, 128, 1, 289] - - [619, 2291.22] + - [691, 2291.22] - - [1024, 256, 1, 289] - - [617, 4064.46] + - [689, 4064.46] - - [768, 192, 1, 289] - - [613, 2690.43] + - [685, 2690.43] - - [1536, 384, 1, 64] - - [600, 3145.83] + - [672, 3145.83] - - [288, 64, 1, 1225] - - [602, 1142.77] + - [674, 1142.77] - - [1024, 192, 1, 289] - - [607, 3243.23] + - [679, 3243.23] - - [384, 96, 1, 1225] - - [620, 1844.81] + - [692, 1844.81] - - [160, 64, 1, 5329] - - [606, 1564.58] + - [678, 1564.58] - - [768, 160, 1, 289] - - [601, 2386.68] + - [673, 2386.68] - - [1024, 3392, 1, 4096] - - [646, 8503.02] + - [718, 8503.02] - - [1024, 3301, 1, 4096] - - [648, 8414.1] + - [720, 8414.1] - - [1024, 3443, 1, 4096] - - [635, 8536.59] + - [707, 8536.59] - - [132, 134, 480, 64] - - [673, 4149.27] + - [745, 4149.27] - - [162, 162, 400, 64] - - [661, 5539.73] + - [733, 5539.73] - - [4096, 3548, 1, 1024] - - [627, 9773.01] + - [699, 9773.01] - - [4096, 2977, 1, 1024] - - [628, 9574.43] + - [700, 9574.43] - - [132, 135, 480, 64] - - [673, 4167.51] + - [745, 4167.51] - - [1024, 2985, 1, 4096] - - [631, 9133.99] + - [703, 9133.99] - - [33708, 3681, 1, 1024] - - [628, 10033.8] + - [700, 10033.8] - - [4096, 3443, 1, 1024] - - [628, 9513.78] + - [700, 9513.78] - - [11, 11, 5456, 64] - - [670, 627.346] + - [742, 627.346] - - [1024, 3400, 1, 4096] - - [649, 8420.02] + - [721, 8420.02] - - [4096, 3995, 1, 1024] - - [627, 9693.87] + - [699, 9693.87] - - [4096, 3190, 1, 1024] - - [627, 9474.84] + - [699, 9474.84] - - [4096, 3594, 1, 1024] - - [628, 9315.83] + - [700, 9315.83] - - [159, 162, 400, 64] - - [660, 5429.98] + - [732, 5429.98] - - [1024, 3565, 1, 4096] - - [643, 8532.8] + - [715, 8532.8] - - [4096, 3422, 1, 1024] - - [628, 9459.24] + - [700, 9459.24] - - [1024, 3214, 1, 4096] - - [648, 8064.92] + - [720, 8064.92] - - [33708, 3584, 1, 1024] - - [629, 10129.0] + - [701, 10129.0] - - [33708, 3640, 1, 1024] - - [626, 9919.22] + - [698, 9919.22] - - [4096, 3263, 1, 1024] - - [626, 9699.35] + - [698, 9699.35] - - [4096, 3296, 1, 1024] - - [626, 9780.8] + - [698, 9780.8] - - [1024, 3557, 1, 4096] - - [647, 8526.89] + - [719, 8526.89] - - [4096, 3463, 1, 1024] - - [626, 9578.13] + - [698, 9578.13] - - [4096, 3528, 1, 1024] - - [626, 9739.92] + - [698, 9739.92] - - [14, 14, 4368, 64] - - [658, 991.276] + - [730, 991.276] - - [4096, 3226, 1, 1024] - - [626, 9587.19] + - [698, 9587.19] - - [4096, 3439, 1, 1024] - - [629, 9499.72] + - [701, 9499.72] - - [1024, 3523, 1, 4096] - - [649, 8393.58] + - [721, 8393.58] - - [1024, 3098, 1, 4096] - - [655, 7882.87] + - [727, 7882.87] - - [4096, 3121, 1, 1024] - - [626, 9296.23] + - [698, 9296.23] - - [33708, 3894, 1, 1024] - - [627, 9952.27] + - [699, 9952.27] - - [1024, 3548, 1, 4096] - - [633, 8432.45] + - [705, 8432.45] - - [1024, 3451, 1, 4096] - - [646, 8456.44] + - [718, 8456.44] - - [4096, 3353, 1, 1024] - - [628, 9289.08] + - [700, 9289.08] - - [4096, 3402, 1, 1024] - - [628, 9406.44] + - [700, 9406.44] - - [4096, 3939, 1, 1024] - - [626, 9549.59] + - [698, 9549.59] - - [133, 133, 480, 64] - - [673, 4124.31] + - [745, 4124.31] - - [1024, 3559, 1, 4096] - - [648, 8587.04] + - [720, 8587.04] - - [1024, 2977, 1, 4096] - - [631, 9084.59] + - [703, 9084.59] - - [1024, 3478, 1, 4096] - - [642, 8342.85] + - [714, 8342.85] - - [134, 134, 480, 64] - - [675, 4204.43] + - [747, 4204.43] - - [1024, 3368, 1, 4096] - - [648, 8277.43] + - [720, 8277.43] - - [4096, 4012, 1, 1024] - - [628, 9726.57] + - [700, 9726.57] - - [4096, 3486, 1, 1024] - - [626, 9639.71] + - [698, 9639.71] - - [1024, 3479, 1, 4096] - - [636, 8420.37] + - [708, 8420.37] - - [1024, 3505, 1, 4096] - - [648, 8310.66] + - [720, 8310.66] - - [4096, 3381, 1, 1024] - - [629, 9357.75] + - [701, 9357.75] - - [4096, 3430, 1, 1024] - - [626, 9482.36] + - [698, 9482.36] - - [1024, 3554, 1, 4096] - - [648, 8592.38] + - [720, 8592.38] - - [4096, 3271, 1, 1024] - - [626, 9715.41] + - [698, 9715.41] - - [1024, 3063, 1, 4096] - - [630, 9388.56] + - [702, 9388.56] - - [1024, 3209, 1, 4096] - - [648, 8212.74] + - [720, 8212.74] - - [4096, 3503, 1, 1024] - - [628, 9680.59] + - [700, 9680.59] - - [4096, 3344, 1, 1024] - - [626, 9268.55] + - [698, 9268.55] - - [1024, 3147, 1, 4096] - - [649, 8037.2] + - [721, 8037.2] - - [1024, 3322, 1, 4096] - - [647, 8356.32] + - [719, 8356.32] - - [1024, 3341, 1, 4096] - - [648, 8316.33] + - [720, 8316.33] - - [1024, 3516, 1, 4096] - - [630, 8397.12] + - [702, 8397.12] - - [102, 101, 624, 64] - - [661, 4709.59] + - [733, 4709.59] - - [1024, 3454, 1, 4096] - - [647, 8425.6] + - [719, 8425.6] - - [4096, 3969, 1, 1024] - - [628, 9640.15] + - [700, 9640.15] - - [4096, 3466, 1, 1024] - - [628, 9576.83] + - [700, 9576.83] - - [1024, 3999, 1, 1024] - - [631, 9207.15] + - [703, 9207.15] - - [1024, 4032, 1, 1024] - - [632, 9294.56] + - [704, 9294.56] - - [1024, 3403, 1, 4096] - - [646, 8357.97] + - [718, 8357.97] - - [4096, 3361, 1, 1024] - - [628, 9308.78] + - [700, 9308.78] - - [1024, 3527, 1, 4096] - - [647, 8512.19] + - [719, 8512.19] - - [1024, 3822, 1, 4096] - - [631, 8991.13] + - [703, 8991.13] - - [4096, 3315, 1, 1024] - - [626, 9834.96] + - [698, 9834.96] - - [232, 232, 272, 64] - - [660, 6481.62] + - [732, 6481.62] - - [1024, 3336, 1, 4096] - - [649, 8295.61] + - [721, 8295.61] - - [228, 232, 272, 64] - - [661, 6327.85] + - [733, 6327.85] - - [4096, 3547, 1, 1024] - - [626, 9781.56] + - [698, 9781.56] - - [4096, 3340, 1, 1024] - - [628, 9269.72] + - [700, 9269.72] - - [1024, 3906, 1, 1024] - - [632, 9018.38] + - [704, 9018.38] - - [1024, 3295, 1, 4096] - - [646, 8194.83] + - [718, 8194.83] - - [4096, 3294, 1, 1024] - - [629, 9762.16] + - [701, 9762.16] - - [33708, 3968, 1, 1024] - - [629, 10147.8] + - [701, 10147.8] - - [1024, 3473, 1, 4096] - - [635, 8318.68] + - [707, 8318.68] - - [1024, 3072, 1, 4096] - - [632, 9370.13] + - [704, 9370.13] - - [4096, 3189, 1, 1024] - - [626, 9470.26] + - [698, 9470.26] - - [4096, 3494, 1, 1024] - - [626, 9661.32] + - [698, 9661.32] - - [1024, 3522, 1, 4096] - - [649, 8459.23] + - [721, 8459.23] - - [33708, 3944, 1, 1024] - - [629, 10060.2] + - [701, 10060.2] - - [135, 135, 480, 64] - - [674, 4257.03] + - [746, 4257.03] - - [4096, 3421, 1, 1024] - - [626, 9456.98] + - [698, 9456.98] - - [32, 32, 1984, 64] - - [671, 3436.24] + - [743, 3436.24] - - [4096, 3311, 1, 1024] - - [626, 9810.88] + - [698, 9810.88] - - [1024, 3990, 1, 1024] - - [633, 9197.74] + - [705, 9197.74] - - [1024, 3290, 1, 4096] - - [646, 8229.63] + - [718, 8229.63] - - [4096, 3565, 1, 1024] - - [627, 9824.48] + - [699, 9824.48] - - [1024, 3484, 1, 4096] - - [636, 8575.38] + - [708, 8575.38] - - [4096, 3384, 1, 1024] - - [626, 9366.54] + - [698, 9366.54] - - [1024, 3422, 1, 4096] - - [646, 8484.12] + - [718, 8484.12] - - [4096, 3681, 1, 1024] - - [627, 9520.16] + - [699, 9520.16] - - [1024, 3584, 1, 1024] - - [653, 8583.37] + - [725, 8583.37] - - [4096, 4050, 1, 1024] - - [628, 9807.35] + - [700, 9807.35] - - [1024, 3996, 1, 4096] - - [629, 9181.7] + - [701, 9181.7] - - [4096, 3169, 1, 1024] - - [627, 9411.4] + - [699, 9411.4] - - [4096, 3538, 1, 1024] - - [627, 9765.99] + - [699, 9765.99] - - [1024, 3495, 1, 4096] - - [633, 8295.95] + - [705, 8295.95] - - [4096, 3401, 1, 1024] - - [626, 9402.68] + - [698, 9402.68] - - [1024, 3560, 1, 4096] - - [647, 8513.45] + - [719, 8513.45] - - [133, 135, 480, 64] - - [674, 4199.08] + - [746, 4199.08] - - [1024, 3263, 1, 4096] - - [648, 8172.23] + - [720, 8172.23] - - [1024, 3870, 1, 4096] - - [628, 8996.27] + - [700, 8996.27] - - [4096, 3555, 1, 1024] - - [629, 9811.88] + - [701, 9811.88] - - [4096, 3412, 1, 1024] - - [626, 9432.09] + - [698, 9432.09] - - [101, 101, 624, 64] - - [660, 4667.69] + - [732, 4667.69] - - [1024, 3296, 1, 4096] - - [647, 8350.61] + - [719, 8350.61] - - [1024, 3379, 1, 4096] - - [649, 8432.94] + - [721, 8432.94] - - [4096, 3302, 1, 1024] - - [626, 9796.39] + - [698, 9796.39] - - [1024, 3490, 1, 4096] - - [646, 8538.44] + - [718, 8538.44] - - [1024, 3428, 1, 4096] - - [647, 8531.67] + - [719, 8531.67] - - [1024, 3976, 1, 4096] - - [628, 9327.87] + - [700, 9327.87] - - [4096, 3485, 1, 1024] - - [626, 9628.82] + - [698, 9628.82] - - [4096, 3534, 1, 1024] - - [626, 9755.97] + - [698, 9755.97] - - [1024, 3064, 1, 4096] - - [632, 9196.98] + - [704, 9196.98] - - [4096, 3216, 1, 1024] - - [628, 9563.44] + - [700, 9563.44] - - [1024, 3450, 1, 4096] - - [656, 8519.29] + - [728, 8519.29] - - [1024, 3533, 1, 4096] - - [647, 8495.77] + - [719, 8495.77] - - [1024, 4030, 1, 1024] - - [632, 9304.68] + - [704, 9304.68] - - [1024, 3311, 1, 4096] - - [647, 8278.6] + - [719, 8278.6] - - [1024, 3468, 1, 4096] - - [638, 8564.55] + - [710, 8564.55] - - [23, 23, 2720, 64] - - [662, 2311.55] + - [734, 2311.55] - - [4096, 3359, 1, 1024] - - [628, 9309.15] + - [700, 9309.15] - - [4096, 3392, 1, 1024] - - [628, 9388.19] + - [700, 9388.19] - - [1024, 3925, 1, 1024] - - [630, 9006.72] + - [702, 9006.72] - - [4096, 3233, 1, 1024] - - [626, 9603.64] + - [698, 9603.64] - - [4096, 3956, 1, 1024] - - [627, 9581.94] + - [699, 9581.94] - - [1024, 3463, 1, 4096] - - [648, 8293.97] + - [720, 8293.97] - - [1024, 3126, 1, 4096] - - [647, 7978.13] + - [719, 7978.13] - - [1024, 3363, 1, 4096] - - [640, 8267.47] + - [712, 8267.47] - - [4096, 3465, 1, 1024] - - [626, 9590.74] + - [698, 9590.74] - - [33708, 3996, 1, 1024] - - [627, 9899.99] + - [699, 9899.99] - - [1024, 3231, 1, 4096] - - [648, 8231.68] + - [720, 8231.68] - - [33708, 3978, 1, 1024] - - [627, 9853.64] + - [699, 9853.64] - - [4096, 3476, 1, 1024] - - [626, 9616.62] + - [698, 9616.62] - - [85, 85, 752, 64] - - [658, 4240.65] + - [730, 4240.65] - - [4096, 3339, 1, 1024] - - [628, 9249.81] + - [700, 9249.81] - - [4096, 3452, 1, 1024] - - [626, 9534.13] + - [698, 9534.13] - - [1024, 3396, 1, 4096] - - [647, 8451.23] + - [719, 8451.23] - - [4096, 3293, 1, 1024] - - [628, 9775.22] + - [700, 9775.22] - - [54, 54, 1184, 64] - - [660, 4153.54] + - [732, 4153.54] - - [1024, 3432, 1, 4096] - - [641, 8345.53] + - [713, 8345.53] - - [4096, 3493, 1, 1024] - - [629, 9649.9] + - [701, 9649.9] - - [4096, 3350, 1, 1024] - - [628, 9273.91] + - [700, 9273.91] - - [1024, 3079, 1, 4096] - - [656, 7775.66] + - [728, 7775.66] - - [1024, 3101, 1, 4096] - - [656, 7847.85] + - [728, 7847.85] - - [33708, 3939, 1, 1024] - - [629, 10054.4] + - [701, 10054.4] - - [4096, 3256, 1, 1024] - - [628, 9681.83] + - [700, 9681.83] - - [1024, 3439, 1, 4096] - - [647, 8531.11] + - [719, 8531.11] - - [1024, 3510, 1, 4096] - - [646, 8422.31] + - [718, 8422.31] - - [4096, 3900, 1, 1024] - - [627, 9468.61] + - [699, 9468.61] - - [1024, 3470, 1, 4096] - - [648, 8507.77] + - [720, 8507.77] - - [4096, 3456, 1, 1024] - - [628, 9577.46] + - [700, 9577.46] - - [4096, 3014, 1, 1024] - - [627, 9666.15] + - [699, 9666.15] - - [4096, 3367, 1, 1024] - - [629, 9328.36] + - [701, 9328.36] - - [4096, 3432, 1, 1024] - - [626, 9480.88] + - [698, 9480.88] - - [33708, 4026, 1, 1024] - - [629, 9972.83] + - [701, 9972.83] - - [4096, 3273, 1, 1024] - - [626, 9716.95] + - [698, 9716.95] - - [4096, 3130, 1, 1024] - - [626, 9311.4] + - [698, 9311.4] - - [1024, 3496, 1, 4096] - - [637, 8434.65] + - [709, 8434.65] - - [1024, 3995, 1, 4096] - - [622, 9157.73] + - [694, 9157.73] - - [1024, 3939, 1, 4096] - - [630, 9059.86] + - [702, 9059.86] - - [1024, 3121, 1, 4096] - - [654, 7963.43] + - [726, 7963.43] - - [1024, 3232, 1, 4096] - - [648, 8061.09] + - [720, 8061.09] - - [4096, 3147, 1, 1024] - - [628, 9364.63] + - [700, 9364.63] - - [4096, 3516, 1, 1024] - - [626, 9708.84] + - [698, 9708.84] - - [1024, 3969, 1, 1024] - - [632, 9168.68] + - [704, 9168.68] - - [1024, 3364, 1, 4096] - - [636, 8363.65] + - [708, 8363.65] - - [4096, 3411, 1, 1024] - - [629, 9442.77] + - [701, 9442.77] - - [147, 147, 432, 64] - - [673, 4843.21] + - [745, 4843.21] - - [4096, 3301, 1, 1024] - - [628, 9783.46] + - [700, 9783.46] - - [112, 111, 576, 64] - - [660, 5627.47] + - [732, 5627.47] - - [1024, 3513, 1, 4096] - - [647, 8725.41] + - [719, 8725.41] - - [1024, 3469, 1, 4096] - - [627, 8183.11] + - [699, 8183.11] - - [1024, 3095, 1, 4096] - - [648, 7887.87] + - [720, 7887.87] - - [4096, 3533, 1, 1024] - - [627, 9755.27] + - [699, 9755.27] - - [4096, 3390, 1, 1024] - - [626, 9377.21] + - [698, 9377.21] - - [4096, 3582, 1, 1024] - - [626, 9874.96] + - [698, 9874.96] - - [1024, 3956, 1, 1024] - - [632, 9058.82] + - [704, 9058.82] - - [4096, 3585, 1, 1024] - - [628, 9289.75] + - [700, 9289.75] - - [4096, 3231, 1, 1024] - - [627, 9597.15] + - [699, 9597.15] - - [1024, 3205, 1, 4096] - - [646, 8073.25] + - [718, 8073.25] - - [4096, 3496, 1, 1024] - - [627, 9668.38] + - [699, 9668.38] - - [1024, 3143, 1, 4096] - - [646, 8031.68] + - [718, 8031.68] - - [1024, 3318, 1, 4096] - - [643, 8261.43] + - [715, 8261.43] - - [1024, 3353, 1, 4096] - - [647, 8414.92] + - [719, 8414.92] - - [1024, 3464, 1, 4096] - - [646, 8310.03] + - [718, 8310.03] - - [4096, 2736, 1, 1024] - - [628, 9563.12] + - [700, 9563.12] - - [1024, 3402, 1, 4096] - - [643, 8413.84] + - [715, 8413.84] - - [4096, 3138, 1, 1024] - - [628, 9342.09] + - [700, 9342.09] - - [1024, 3860, 1, 4096] - - [631, 9008.57] + - [703, 9008.57] - - [148, 148, 432, 64] - - [673, 4915.7] + - [745, 4915.7] - - [1024, 3539, 1, 4096] - - [643, 8449.36] + - [715, 8449.36] - - [4096, 3211, 1, 1024] - - [628, 9551.28] + - [700, 9551.28] - - [1024, 3332, 1, 4096] - - [636, 8295.11] + - [708, 8295.11] - - [1024, 3466, 1, 4096] - - [647, 8339.25] + - [719, 8339.25] - - [4096, 3475, 1, 1024] - - [626, 9612.33] + - [698, 9612.33] - - [4096, 3524, 1, 1024] - - [629, 9722.74] + - [701, 9722.74] - - [4096, 2985, 1, 1024] - - [629, 9591.33] + - [701, 9591.33] - - [4096, 3222, 1, 1024] - - [626, 9577.48] + - [698, 9577.48] - - [4096, 3451, 1, 1024] - - [628, 9541.42] + - [700, 9541.42] - - [1024, 3181, 1, 4096] - - [646, 8118.89] + - [718, 8118.89] - - [1024, 3640, 1, 4096] - - [631, 8617.11] + - [703, 8617.11] - - [1024, 3375, 1, 4096] - - [635, 8419.75] + - [707, 8419.75] - - [1024, 3550, 1, 4096] - - [648, 8512.83] + - [720, 8512.83] - - [1024, 4020, 1, 1024] - - [632, 9266.9] + - [704, 9266.9] - - [1024, 3840, 1, 4096] - - [631, 8983.49] + - [703, 8983.49] - - [4096, 3349, 1, 1024] - - [626, 9279.96] + - [698, 9279.96] - - [4096, 3398, 1, 1024] - - [627, 9402.32] + - [699, 9402.32] - - [33708, 3976, 1, 1024] - - [628, 9849.54] + - [700, 9849.54] - - [1024, 2917, 1, 4096] - - [633, 8936.87] + - [705, 8936.87] - - [33708, 3910, 1, 1024] - - [626, 9983.35] + - [698, 9983.35] - - [4096, 3860, 1, 1024] - - [627, 9377.58] + - [699, 9377.58] - - [4096, 3304, 1, 1024] - - [629, 9798.44] + - [701, 9798.44] - - [1024, 3286, 1, 4096] - - [634, 8167.41] + - [706, 8167.41] - - [1024, 3460, 1, 4096] - - [644, 8539.56] + - [716, 8539.56] - - [1024, 4026, 1, 4096] - - [630, 9305.68] + - [702, 9305.68] - - [4096, 3471, 1, 1024] - - [628, 9596.71] + - [700, 9596.71] - - [193, 193, 320, 64] - - [676, 4758.46] + - [748, 4758.46] - - [1024, 3894, 1, 1024] - - [630, 8979.6] + - [702, 8979.6] - - [65, 65, 992, 64] - - [672, 2565.49] + - [744, 2565.49] - - [1024, 3506, 1, 4096] - - [644, 8593.22] + - [716, 8593.22] - - [35, 35, 1808, 64] - - [666, 2129.72] + - [738, 2129.72] - - [1024, 4000, 1, 1024] - - [630, 9204.6] + - [702, 9204.6] - - [1024, 3900, 1, 4096] - - [626, 9050.36] + - [698, 9050.36] - - [1024, 3445, 1, 4096] - - [649, 8551.65] + - [721, 8551.65] - - [4096, 3442, 1, 1024] - - [627, 9505.0] + - [699, 9505.0] - - [1024, 3358, 1, 4096] - - [648, 8437.16] + - [720, 8437.16] - - [13, 13, 4672, 64] - - [659, 860.665] + - [731, 860.665] - - [1024, 3211, 1, 4096] - - [652, 8085.25] + - [724, 8085.25] - - [4096, 3515, 1, 1024] - - [628, 9715.29] + - [700, 9715.29] - - [1024, 3564, 1, 4096] - - [634, 8760.37] + - [706, 8760.37] - - [4096, 3057, 1, 1024] - - [628, 9804.05] + - [700, 9804.05] - - [1024, 3343, 1, 4096] - - [646, 8363.8] + - [718, 8363.8] - - [4096, 3262, 1, 1024] - - [627, 9686.49] + - [699, 9686.49] - - [1024, 3518, 1, 4096] - - [646, 8455.05] + - [718, 8455.05] - - [77, 77, 816, 64] - - [665, 3505.94] + - [737, 3505.94] - - [33708, 3876, 1, 1024] - - [627, 9895.95] + - [699, 9895.95] - - [4096, 3462, 1, 1024] - - [628, 9570.31] + - [700, 9570.31] - - [1024, 3265, 1, 4096] - - [646, 8322.75] + - [718, 8322.75] - - [4096, 3389, 1, 1024] - - [627, 9382.86] + - [699, 9382.86] - - [4096, 3438, 1, 1024] - - [628, 9503.47] + - [700, 9503.47] - - [1024, 3955, 1, 1024] - - [630, 9064.45] + - [702, 9064.45] - - [1024, 3545, 1, 4096] - - [649, 8652.41] + - [721, 8652.41] - - [1024, 3144, 1, 4096] - - [649, 8060.55] + - [721, 8060.55] - - [1024, 3417, 1, 4096] - - [647, 8505.91] + - [719, 8505.91] - - [4096, 3543, 1, 1024] - - [626, 9775.67] + - [698, 9775.67] - - [4096, 3352, 1, 1024] - - [628, 9282.87] + - [700, 9282.87] - - [33708, 3975, 1, 1024] - - [629, 9849.49] + - [701, 9849.49] - - [148, 147, 432, 64] - - [673, 4876.15] + - [745, 4876.15] - - [4096, 3137, 1, 1024] - - [626, 9330.63] + - [698, 9330.63] - - [4096, 3506, 1, 1024] - - [629, 9682.76] + - [701, 9682.76] - - [1024, 3975, 1, 1024] - - [632, 9164.77] + - [704, 9164.77] - - [1024, 3859, 1, 4096] - - [630, 8983.84] + - [702, 8983.84] - - [4096, 3369, 1, 1024] - - [628, 9330.45] + - [700, 9330.45] - - [1024, 3434, 1, 4096] - - [646, 8486.98] + - [718, 8486.98] - - [1024, 3292, 1, 4096] - - [646, 8478.96] + - [718, 8478.96] - - [4096, 3523, 1, 1024] - - [626, 9734.83] + - [698, 9734.83] - - [4096, 3380, 1, 1024] - - [628, 9354.49] + - [700, 9354.49] - - [1024, 3408, 1, 4096] - - [649, 8441.03] + - [721, 8441.03] - - [4096, 3221, 1, 1024] - - [628, 9575.59] + - [700, 9575.59] - - [4096, 3270, 1, 1024] - - [628, 9717.95] + - [700, 9717.95] - - [143, 143, 432, 64] - - [674, 4643.45] + - [746, 4643.45] - - [111, 111, 576, 64] - - [666, 5475.04] + - [738, 5475.04] - - [1024, 3303, 1, 4096] - - [648, 8413.07] + - [720, 8413.07] - - [4096, 3502, 1, 1024] - - [628, 9679.87] + - [700, 9679.87] - - [1024, 3222, 1, 4096] - - [648, 8141.88] + - [720, 8141.88] - - [4096, 2505, 1, 1024] - - [626, 9594.95] + - [698, 9594.95] - - [4096, 3397, 1, 1024] - - [626, 9392.61] + - [698, 9392.61] - - [4096, 3562, 1, 1024] - - [626, 9827.58] + - [698, 9827.58] - - [4096, 3095, 1, 1024] - - [628, 9222.45] + - [700, 9222.45] - - [1024, 3226, 1, 4096] - - [644, 8027.03] + - [716, 8027.03] - - [177, 177, 352, 64] - - [661, 6406.96] + - [733, 6406.96] - - [4096, 3360, 1, 1024] - - [627, 9298.15] + - [699, 9298.15] - - [1024, 3942, 1, 1024] - - [632, 9061.59] + - [704, 9061.59] - - [1024, 3298, 1, 4096] - - [649, 8254.36] + - [721, 8254.36] - - [1024, 3381, 1, 4096] - - [648, 8508.81] + - [720, 8508.81] - - [4096, 3314, 1, 1024] - - [628, 9837.56] + - [700, 9837.56] - - [1024, 3492, 1, 4096] - - [636, 8583.39] + - [708, 8583.39] - - [1024, 3430, 1, 4096] - - [636, 8492.71] + - [708, 8492.71] - - [4096, 3977, 1, 1024] - - [628, 9656.45] + - [700, 9656.45] - - [4096, 3546, 1, 1024] - - [626, 9780.35] + - [698, 9780.35] - - [4096, 3640, 1, 1024] - - [626, 9415.51] + - [698, 9415.51] - - [4096, 3441, 1, 1024] - - [627, 9499.24] + - [699, 9499.24] - - [33708, 4059, 1, 1024] - - [629, 10051.9] + - [701, 10051.9] - - [1024, 3978, 1, 1024] - - [630, 9158.8] + - [702, 9158.8] - - [1024, 3376, 1, 4096] - - [648, 8415.44] + - [720, 8415.44] - - [1024, 3482, 1, 4096] - - [649, 8396.62] + - [721, 8396.62] - - [1024, 3563, 1, 4096] - - [632, 8424.18] + - [704, 8424.18] - - [4096, 4020, 1, 1024] - - [629, 9745.96] + - [701, 9745.96] - - [1024, 3271, 1, 4096] - - [647, 8289.68] + - [719, 8289.68] - - [1024, 3291, 1, 4096] - - [647, 8222.71] + - [719, 8222.71] - - [1024, 3431, 1, 4096] - - [642, 8464.4] + - [714, 8464.4] - - [1024, 3481, 1, 4096] - - [648, 8386.5] + - [720, 8386.5] - - [84, 85, 752, 64] - - [663, 4194.85] + - [735, 4194.85] - - [4096, 3461, 1, 1024] - - [626, 9579.67] + - [698, 9579.67] - - [1024, 3574, 1, 4096] - - [649, 8579.8] + - [721, 8579.8] - - [1024, 4059, 1, 1024] - - [630, 9330.54] + - [702, 9330.54] - - [84, 84, 752, 64] - - [670, 4141.46] + - [742, 4141.46] - - [1024, 3421, 1, 4096] - - [649, 8528.42] + - [721, 8528.42] - - [4096, 3224, 1, 1024] - - [628, 9589.95] + - [700, 9589.95] - - [4096, 3437, 1, 1024] - - [628, 9498.2] + - [700, 9498.2] - - [45, 45, 1424, 64] - - [660, 3314.58] + - [732, 3314.58] - - [4096, 3840, 1, 1024] - - [626, 9931.37] + - [698, 9931.37] - - [4096, 3168, 1, 1024] - - [628, 9412.16] + - [700, 9412.16] - - [33708, 3990, 1, 1024] - - [626, 9884.39] + - [698, 9884.39] - - [1024, 3349, 1, 4096] - - [648, 8421.4] + - [720, 8421.4] - - [4096, 3335, 1, 1024] - - [626, 9241.65] + - [698, 9241.65] - - [4096, 3400, 1, 1024] - - [628, 9407.35] + - [700, 9407.35] - - [160, 159, 400, 64] - - [675, 5708.94] + - [747, 5708.94] - - [1024, 3398, 1, 4096] - - [648, 8624.03] + - [720, 8624.03] - - [1024, 3780, 1, 4096] - - [628, 8756.78] + - [700, 8756.78] - - [29, 29, 2176, 64] - - [671, 2963.69] + - [743, 2963.69] - - [4096, 3098, 1, 1024] - - [626, 9229.82] + - [698, 9229.82] - - [1024, 4012, 1, 4096] - - [632, 9422.03] + - [704, 9422.03] - - [4096, 3505, 1, 1024] - - [628, 9687.65] + - [700, 9687.65] - - [4096, 3554, 1, 1024] - - [628, 9812.22] + - [700, 9812.22] - - [4096, 3063, 1, 1024] - - [628, 9825.1] + - [700, 9825.1] - - [1024, 3503, 1, 4096] - - [646, 8404.74] + - [718, 8404.74] - - [1024, 3166, 1, 4096] - - [649, 8084.93] + - [721, 8084.93] - - [1024, 3425, 1, 4096] - - [649, 8537.58] + - [721, 8537.58] - - [1024, 3344, 1, 4096] - - [640, 8351.16] + - [712, 8351.16] - - [4096, 3484, 1, 1024] - - [628, 9635.7] + - [700, 9635.7] - - [1024, 3681, 1, 1024] - - [631, 8457.18] + - [703, 8457.18] - - [1024, 4050, 1, 1024] - - [632, 9326.21] + - [704, 9326.21] - - [4096, 3379, 1, 1024] - - [626, 9356.16] + - [698, 9356.16] - - [4096, 3428, 1, 1024] - - [627, 9472.33] + - [699, 9472.33] - - [12, 12, 5040, 64] - - [665, 741.617] + - [737, 741.617] - - [27, 27, 2336, 64] - - [671, 2757.9] + - [743, 2757.9] - - [1024, 3304, 1, 4096] - - [649, 8317.82] + - [721, 8317.82] - - [1024, 3387, 1, 4096] - - [647, 8460.15] + - [719, 8460.15] - - [4096, 3126, 1, 1024] - - [629, 9308.48] + - [701, 9308.48] - - [1024, 3498, 1, 4096] - - [646, 8485.55] + - [718, 8485.55] - - [1024, 3436, 1, 4096] - - [648, 8397.71] + - [720, 8397.71] - - [4096, 3501, 1, 1024] - - [626, 9681.19] + - [698, 9681.19] - - [4096, 3358, 1, 1024] - - [628, 9304.9] + - [700, 9304.9] - - [4096, 3232, 1, 1024] - - [626, 9607.2] + - [698, 9607.2] - - [1024, 3585, 1, 4096] - - [630, 8510.74] + - [702, 8510.74] - - [4096, 3143, 1, 1024] - - [629, 9355.91] + - [701, 9355.91] - - [4096, 3464, 1, 1024] - - [628, 9585.95] + - [700, 9585.95] - - [1024, 3366, 1, 4096] - - [636, 8275.23] + - [708, 8275.23] - - [4096, 3375, 1, 1024] - - [626, 9342.13] + - [698, 9342.13] - - [4096, 2917, 1, 1024] - - [626, 9372.84] + - [698, 9372.84] - - [4096, 4026, 1, 1024] - - [628, 9759.15] + - [700, 9759.15] - - [49, 49, 1296, 64] - - [667, 3710.02] + - [739, 3710.02] - - [1024, 3277, 1, 4096] - - [647, 8217.1] + - [719, 8217.1] - - [1024, 3103, 1, 4096] - - [648, 7872.67] + - [720, 7872.67] - - [33708, 3995, 1, 1024] - - [628, 9893.08] + - [700, 9893.08] - - [1024, 3297, 1, 4096] - - [647, 8185.82] + - [719, 8185.82] - - [4096, 3545, 1, 1024] - - [628, 9789.43] + - [700, 9789.43] - - [1024, 3399, 1, 4096] - - [647, 8377.18] + - [719, 8377.18] - - [33708, 3796, 1, 1024] - - [627, 10008.0] + - [699, 10008.0] - - [4096, 3292, 1, 1024] - - [628, 9767.28] + - [700, 9767.28] - - [71, 71, 896, 64] - - [662, 3006.25] + - [734, 3006.25] - - [33708, 3859, 1, 1024] - - [629, 9860.37] + - [701, 9860.37] - - [4096, 3566, 1, 1024] - - [628, 9834.47] + - [700, 9834.47] - - [4096, 3894, 1, 1024] - - [626, 9456.67] + - [698, 9456.67] - - [4096, 3492, 1, 1024] - - [626, 9653.24] + - [698, 9653.24] - - [1024, 3977, 1, 1024] - - [632, 9161.33] + - [704, 9161.33] - - [1024, 3272, 1, 4096] - - [649, 8257.09] + - [721, 8257.09] - - [135, 134, 480, 64] - - [673, 4238.39] + - [745, 4238.39] - - [1024, 3355, 1, 4096] - - [647, 8374.64] + - [719, 8374.64] - - [4096, 3419, 1, 1024] - - [629, 9455.44] + - [701, 9455.44] - - [1024, 3404, 1, 4096] - - [648, 8580.28] + - [720, 8580.28] - - [4096, 3999, 1, 1024] - - [628, 9701.78] + - [700, 9701.78] - - [4096, 3166, 1, 1024] - - [626, 9410.48] + - [698, 9410.48] - - [33708, 3840, 1, 1024] - - [629, 10132.9] + - [701, 10132.9] - - [4096, 4032, 1, 1024] - - [629, 9762.86] + - [701, 9762.86] - - [1024, 3573, 1, 4096] - - [647, 8603.4] + - [719, 8603.4] - - [4096, 3366, 1, 1024] - - [629, 9322.63] + - [701, 9322.63] - - [1024, 3541, 1, 4096] - - [649, 8405.9] + - [721, 8405.9] - - [4096, 3207, 1, 1024] - - [626, 9544.25] + - [698, 9544.25] - - [4096, 3272, 1, 1024] - - [628, 9716.73] + - [700, 9716.73] - - [1024, 3334, 1, 4096] - - [646, 8241.39] + - [718, 8241.39] - - [228, 228, 272, 64] - - [661, 6232.45] + - [733, 6232.45] - - [4096, 3183, 1, 1024] - - [628, 9452.44] + - [700, 9452.44] - - [4096, 3536, 1, 1024] - - [627, 9759.44] + - [699, 9759.44] - - [1024, 4005, 1, 1024] - - [631, 9225.83] + - [703, 9225.83] - - [1024, 3245, 1, 4096] - - [648, 8074.31] + - [720, 8074.31] - - [4096, 3447, 1, 1024] - - [627, 9525.84] + - [699, 9525.84] - - [1024, 3183, 1, 4096] - - [647, 8121.62] + - [719, 8121.62] - - [1024, 3361, 1, 4096] - - [649, 8285.86] + - [721, 8285.86] - - [33708, 3870, 1, 1024] - - [627, 9879.35] + - [699, 9879.35] - - [1024, 3321, 1, 4096] - - [648, 8408.67] + - [720, 8408.67] - - [1024, 3968, 1, 1024] - - [630, 9202.05] + - [702, 9202.05] - - [1024, 3486, 1, 4096] - - [644, 8258.89] + - [716, 8258.89] - - [4096, 4005, 1, 1024] - - [628, 9723.98] + - [700, 9723.98] - - [4096, 3410, 1, 1024] - - [629, 9440.5] + - [701, 9440.5] - - [1024, 3944, 1, 1024] - - [632, 9040.82] + - [704, 9040.82] - - [4096, 3300, 1, 1024] - - [627, 9789.9] + - [699, 9789.9] - - [4096, 3579, 1, 1024] - - [629, 9859.44] + - [701, 9859.44] - - [4096, 3483, 1, 1024] - - [629, 9624.31] + - [701, 9624.31] - - [4096, 3532, 1, 1024] - - [628, 9742.76] + - [700, 9742.76] - - [1024, 3140, 1, 4096] - - [648, 7899.65] + - [720, 7899.65] - - [1024, 3372, 1, 4096] - - [646, 8237.07] + - [718, 8237.07] - - [1024, 3224, 1, 4096] - - [649, 8159.13] + - [721, 8159.13] - - [4096, 3230, 1, 1024] - - [628, 9601.25] + - [700, 9601.25] - - [4096, 3427, 1, 1024] - - [628, 9466.57] + - [700, 9466.57] - - [1024, 3796, 1, 1024] - - [632, 8739.78] + - [704, 8739.78] - - [143, 148, 432, 64] - - [673, 4762.0] + - [745, 4762.0] - - [1024, 3616, 1, 4096] - - [631, 8445.89] + - [703, 8445.89] - - [1024, 3315, 1, 4096] - - [648, 8403.21] + - [720, 8403.21] - - [1024, 3476, 1, 4096] - - [646, 8523.68] + - [718, 8523.68] - - [1024, 3509, 1, 4096] - - [646, 8345.05] + - [718, 8345.05] - - [4096, 3357, 1, 1024] - - [628, 9300.16] + - [700, 9300.16] - - [4096, 3406, 1, 1024] - - [628, 9427.44] + - [700, 9427.44] - - [1024, 3558, 1, 4096] - - [647, 8525.78] + - [719, 8525.78] - - [4096, 3593, 1, 1024] - - [628, 9302.2] + - [700, 9302.2] - - [4096, 3247, 1, 1024] - - [628, 9648.5] + - [700, 9648.5] - - [4096, 3088, 1, 1024] - - [628, 9204.21] + - [700, 9204.21] - - [1024, 3213, 1, 4096] - - [646, 8054.31] + - [718, 8054.31] - - [4096, 3511, 1, 1024] - - [626, 9702.7] + - [698, 9702.7] - - [122, 122, 528, 64] - - [667, 6293.39] + - [739, 6293.39] - - [1024, 3365, 1, 4096] - - [643, 8413.62] + - [715, 8413.62] - - [1024, 3504, 1, 4096] - - [645, 8414.46] + - [717, 8414.46] - - [1024, 3442, 1, 4096] - - [648, 8684.0] + - [720, 8684.0] - - [4096, 3474, 1, 1024] - - [626, 9611.6] + - [698, 9611.6] - - [4096, 2984, 1, 1024] - - [627, 9592.82] + - [699, 9592.82] - - [1024, 3876, 1, 4096] - - [630, 9085.95] + - [702, 9085.95] - - [4096, 3337, 1, 1024] - - [628, 9246.22] + - [700, 9246.22] - - [4096, 3450, 1, 1024] - - [628, 9534.63] + - [700, 9534.63] - - [1024, 3547, 1, 4096] - - [648, 8386.73] + - [720, 8386.73] - - [4096, 3291, 1, 1024] - - [627, 9759.34] + - [699, 9759.34] - - [1024, 3340, 1, 4096] - - [647, 8237.97] + - [719, 8237.97] - - [4096, 3491, 1, 1024] - - [628, 9656.59] + - [700, 9656.59] - - [4096, 3348, 1, 1024] - - [628, 9279.15] + - [700, 9279.15] - - [78, 78, 816, 64] - - [668, 3591.09] + - [740, 3591.09] - - [4096, 3968, 1, 1024] - - [629, 9642.19] + - [701, 9642.19] - - [4096, 3906, 1, 1024] - - [629, 9485.37] + - [701, 9485.37] - - [1024, 3477, 1, 4096] - - [636, 8389.2] + - [708, 8389.2] - - [1024, 3397, 1, 4096] - - [646, 8556.88] + - [718, 8556.88] - - [4096, 3165, 1, 1024] - - [627, 9415.52] + - [699, 9415.52] - - [4096, 3470, 1, 1024] - - [626, 9598.5] + - [698, 9598.5] - - [1024, 3526, 1, 4096] - - [646, 8442.15] + - [718, 8442.15] - - [112, 112, 576, 64] - - [661, 5672.6] + - [733, 5672.6] - - [4096, 3365, 1, 1024] - - [626, 9321.83] + - [698, 9321.83] - - [4096, 3319, 1, 1024] - - [626, 9838.48] + - [698, 9838.48] - - [1024, 3401, 1, 4096] - - [648, 8460.86] + - [720, 8460.86] - - [1024, 3294, 1, 4096] - - [647, 8324.63] + - [719, 8324.63] - - [159, 159, 400, 64] - - [663, 5488.51] + - [735, 5488.51] - - [1024, 3472, 1, 4096] - - [641, 8289.77] + - [713, 8289.77] - - [4096, 3328, 1, 1024] - - [627, 9904.35] + - [699, 9904.35] - - [1024, 3861, 1, 1024] - - [632, 8917.63] + - [704, 8917.63] - - [1024, 3910, 1, 1024] - - [630, 9010.16] + - [702, 9010.16] - - [1024, 3410, 1, 4096] - - [648, 8519.63] + - [720, 8519.63] - - [1024, 3395, 1, 4096] - - [646, 8424.35] + - [718, 8424.35] - - [4096, 3282, 1, 1024] - - [626, 9743.67] + - [698, 9743.67] - - [1024, 3751, 1, 1024] - - [633, 8680.39] + - [705, 8680.39] - - [4096, 3145, 1, 1024] - - [628, 9353.37] + - [700, 9353.37] - - [4096, 3514, 1, 1024] - - [628, 9713.04] + - [700, 9713.04] - - [4096, 3944, 1, 1024] - - [628, 9563.92] + - [700, 9563.92] - - [1024, 3515, 1, 4096] - - [647, 8428.13] + - [719, 8428.13] - - [4096, 3409, 1, 1024] - - [627, 9428.77] + - [699, 9428.77] - - [4096, 3564, 1, 1024] - - [626, 9823.79] + - [698, 9823.79] - - [4096, 3299, 1, 1024] - - [628, 9793.03] + - [700, 9793.03] - - [1024, 3057, 1, 4096] - - [624, 9237.85] + - [696, 9237.85] - - [4096, 3531, 1, 1024] - - [626, 9745.64] + - [698, 9745.64] - - [4096, 3388, 1, 1024] - - [628, 9374.65] + - [700, 9374.65] - - [1024, 3189, 1, 4096] - - [648, 8084.6] + - [720, 8084.6] - - [1024, 3300, 1, 4096] - - [648, 8185.13] + - [720, 8185.13] - - [1024, 3720, 1, 4096] - - [627, 8755.11] + - [699, 8755.11] - - [1024, 3383, 1, 4096] - - [641, 8463.47] + - [713, 8463.47] - - [1024, 3494, 1, 4096] - - [648, 8676.57] + - [720, 8676.57] - - [77, 78, 816, 64] - - [664, 3548.26] + - [736, 3548.26] - - [1024, 3448, 1, 4096] - - [646, 8665.78] + - [718, 8665.78] - - [4096, 3542, 1, 1024] - - [626, 9771.88] + - [698, 9771.88] - - [1024, 3488, 1, 4096] - - [646, 8488.39] + - [718, 8488.39] - - [4096, 3405, 1, 1024] - - [628, 9426.16] + - [700, 9426.16] - - [1024, 3262, 1, 4096] - - [648, 8206.97] + - [720, 8206.97] - - [33708, 4005, 1, 1024] - - [629, 9928.16] + - [701, 9928.16] - - [1024, 3594, 1, 4096] - - [633, 8458.57] + - [705, 8458.57] - - [4096, 3103, 1, 1024] - - [629, 9243.14] + - [701, 9243.14] - - [4096, 3136, 1, 1024] - - [628, 9340.9] + - [700, 9340.9] - - [1024, 3378, 1, 4096] - - [649, 8432.45] + - [721, 8432.45] - - [10, 10, 5952, 64] - - [669, 523.353] + - [741, 523.353] - - [7, 7, 8192, 64] - - [669, 260.543] + - [741, 260.543] - - [4096, 3559, 1, 1024] - - [628, 9813.1] + - [700, 9813.1] - - [4096, 3368, 1, 1024] - - [629, 9328.66] + - [701, 9328.66] - - [4096, 3209, 1, 1024] - - [626, 9538.83] + - [698, 9538.83] - - [4096, 3322, 1, 1024] - - [628, 9839.58] + - [700, 9839.58] - - [1024, 3483, 1, 4096] - - [634, 8348.35] + - [706, 8348.35] - - [4096, 3473, 1, 1024] - - [627, 9605.79] + - [699, 9605.79] - - [4096, 3522, 1, 1024] - - [629, 9730.02] + - [701, 9730.02] - - [1024, 3532, 1, 4096] - - [647, 8474.32] + - [719, 8474.32] - - [4096, 3449, 1, 1024] - - [628, 9528.35] + - [700, 9528.35] - - [1024, 3351, 1, 4096] - - [649, 8311.23] + - [721, 8311.23] - - [1024, 3462, 1, 4096] - - [646, 8297.64] + - [718, 8297.64] - - [4096, 3396, 1, 1024] - - [628, 9400.25] + - [700, 9400.25] - - [132, 132, 480, 64] - - [674, 4089.84] + - [746, 4089.84] - - [111, 112, 576, 64] - - [660, 5529.7] + - [732, 5529.7] - - [1024, 3416, 1, 4096] - - [647, 8556.64] + - [719, 8556.64] - - [4096, 3469, 1, 1024] - - [629, 9598.77] + - [701, 9598.77] - - [1024, 3582, 1, 4096] - - [630, 8461.47] + - [702, 8461.47] - - [1024, 3230, 1, 4096] - - [647, 8188.94] + - [719, 8188.94] - - [1024, 3489, 1, 4096] - - [648, 8457.85] + - [720, 8457.85] - - [1024, 3427, 1, 4096] - - [648, 8566.59] + - [720, 8566.59] - - [1024, 3346, 1, 4096] - - [647, 8352.17] + - [719, 8352.17] - - [33708, 3977, 1, 1024] - - [629, 9868.5] + - [701, 9868.5] - - [4096, 3796, 1, 1024] - - [628, 9797.76] + - [700, 9797.76] - - [4096, 3176, 1, 1024] - - [628, 9435.39] + - [700, 9435.39] - - [4096, 3990, 1, 1024] - - [626, 9672.33] + - [698, 9672.33] - - [1024, 3257, 1, 4096] - - [649, 8225.17] + - [721, 8225.17] - - [4096, 3343, 1, 1024] - - [650, 9273.62] + - [722, 9273.62] - - [4096, 3440, 1, 1024] - - [626, 9501.48] + - [698, 9501.48] - - [33708, 4030, 1, 1024] - - [627, 9983.36] + - [699, 9983.36] - - [1024, 3190, 1, 4096] - - [648, 8192.11] + - [720, 8192.11] - - [1024, 3389, 1, 4096] - - [649, 8439.42] + - [721, 8439.42] - - [1024, 3500, 1, 4096] - - [647, 8556.12] + - [719, 8556.12] - - [1024, 3471, 1, 4096] - - [636, 8491.17] + - [708, 8491.17] - - [1024, 3438, 1, 4096] - - [649, 8567.95] + - [721, 8567.95] - - [4096, 3513, 1, 1024] - - [626, 9710.27] + - [698, 9710.27] - - [1024, 3562, 1, 4096] - - [641, 8608.94] + - [713, 8608.94] - - [4096, 3616, 1, 1024] - - [628, 9357.59] + - [700, 9357.59] - - [4096, 3955, 1, 1024] - - [627, 9589.71] + - [699, 9589.71] - - [1024, 3441, 1, 4096] - - [637, 8359.27] + - [709, 8359.27] - - [1024, 3236, 1, 4096] - - [651, 8022.6] + - [723, 8022.6] - - [1024, 3524, 1, 4096] - - [646, 8477.24] + - [718, 8477.24] - - [4096, 3460, 1, 1024] - - [626, 9581.96] + - [698, 9581.96] - - [16, 16, 3840, 64] - - [658, 1270.59] + - [730, 1270.59] - - [92, 93, 688, 64] - - [662, 4962.4] + - [734, 4962.4] - - [1024, 3384, 1, 4096] - - [637, 8409.39] + - [709, 8409.39] - - [4096, 3387, 1, 1024] - - [628, 9379.8] + - [700, 9379.8] - - [4096, 3436, 1, 1024] - - [626, 9491.93] + - [698, 9491.93] - - [4096, 3277, 1, 1024] - - [626, 9717.27] + - [698, 9717.27] - - [1024, 3457, 1, 4096] - - [646, 8279.22] + - [718, 8279.22] - - [1024, 3999, 1, 4096] - - [621, 9231.47] + - [693, 9231.47] - - [1024, 4032, 1, 4096] - - [630, 9443.62] + - [702, 9443.62] - - [4096, 3541, 1, 1024] - - [626, 9773.24] + - [698, 9773.24] - - [4096, 3334, 1, 1024] - - [626, 9242.79] + - [698, 9242.79] - - [1024, 3393, 1, 4096] - - [648, 8376.17] + - [720, 8376.17] - - [17, 17, 3632, 64] - - [670, 1425.77] + - [742, 1425.77] - - [1024, 3411, 1, 4096] - - [636, 8490.97] + - [708, 8490.97] - - [1024, 3822, 1, 1024] - - [633, 8773.44] + - [705, 8773.44] - - [1024, 3593, 1, 4096] - - [633, 8571.25] + - [705, 8571.25] - - [33708, 3822, 1, 1024] - - [627, 10056.8] + - [699, 10056.8] - - [4096, 3504, 1, 1024] - - [629, 9680.29] + - [701, 9680.29] - - [1024, 3163, 1, 4096] - - [648, 8014.43] + - [720, 8014.43] - - [1024, 3357, 1, 4096] - - [649, 8376.04] + - [721, 8376.04] - - [1024, 3906, 1, 4096] - - [630, 9108.22] + - [702, 9108.22] - - [4096, 3415, 1, 1024] - - [626, 9443.87] + - [698, 9443.87] - - [1024, 3406, 1, 4096] - - [649, 8451.64] + - [721, 8451.64] - - [4096, 3321, 1, 1024] - - [628, 9836.62] + - [700, 9836.62] - - [4096, 3584, 1, 1024] - - [629, 9915.93] + - [701, 9915.93] - - [1024, 2736, 1, 4096] - - [632, 8532.93] + - [704, 8532.93] - - [1024, 3110, 1, 4096] - - [649, 7889.29] + - [721, 7889.29] - - [33708, 3999, 1, 1024] - - [629, 9903.33] + - [701, 9903.33] - - [1024, 3093, 1, 4096] - - [647, 7919.35] + - [719, 7919.35] - - [4096, 3378, 1, 1024] - - [629, 9362.3] + - [701, 9362.3] - - [1024, 3543, 1, 4096] - - [643, 8438.16] + - [715, 8438.16] - - [33708, 3925, 1, 1024] - - [628, 10021.6] + - [700, 10021.6] - - [1024, 3352, 1, 4096] - - [649, 8333.82] + - [721, 8333.82] - - [4096, 3780, 1, 1024] - - [626, 9755.02] + - [698, 9755.02] - - [1024, 3990, 1, 4096] - - [623, 9251.02] + - [695, 9251.02] - - [4096, 3500, 1, 1024] - - [626, 9673.83] + - [698, 9673.83] - - [4096, 3996, 1, 1024] - - [627, 9694.5] + - [699, 9694.5] - - [1024, 3247, 1, 4096] - - [652, 8171.58] + - [724, 8171.58] - - [4096, 3395, 1, 1024] - - [628, 9392.04] + - [700, 9392.04] - - [1024, 3169, 1, 4096] - - [647, 7990.24] + - [719, 7990.24] - - [1024, 3088, 1, 4096] - - [647, 7890.36] + - [719, 7890.36] - - [1024, 3584, 1, 4096] - - [649, 8604.2] + - [721, 8604.2] - - [4096, 3093, 1, 1024] - - [628, 9224.88] + - [700, 9224.88] - - [1024, 3538, 1, 4096] - - [630, 8395.74] + - [702, 8395.74] - - [1024, 3996, 1, 1024] - - [631, 9208.33] + - [703, 9208.33] - - [1024, 3581, 1, 4096] - - [643, 8523.24] + - [715, 8523.24] - - [4096, 3374, 1, 1024] - - [628, 9342.81] + - [700, 9342.81] - - [33708, 3751, 1, 1024] - - [628, 9881.99] + - [700, 9881.99] - - [59, 59, 1088, 64] - - [666, 4515.54] + - [738, 4515.54] - - [4096, 3215, 1, 1024] - - [628, 9557.75] + - [700, 9557.75] - - [4096, 3312, 1, 1024] - - [626, 9834.4] + - [698, 9834.4] - - [4096, 3581, 1, 1024] - - [628, 9856.66] + - [700, 9856.66] - - [4096, 3479, 1, 1024] - - [628, 9620.35] + - [700, 9620.35] - - [4096, 3544, 1, 1024] - - [626, 9778.94] + - [698, 9778.94] - - [1024, 3870, 1, 1024] - - [631, 8935.26] + - [703, 8935.26] - - [1024, 3374, 1, 4096] - - [648, 8412.85] + - [720, 8412.85] - - [1024, 2967, 1, 4096] - - [631, 8982.97] + - [703, 8982.97] - - [41, 41, 1552, 64] - - [660, 2805.38] + - [732, 2805.38] - - [4096, 3455, 1, 1024] - - [626, 9538.89] + - [698, 9538.89] - - [4096, 3942, 1, 1024] - - [627, 9554.65] + - [699, 9554.65] - - [1024, 3528, 1, 4096] - - [646, 8438.47] + - [718, 8438.47] - - [4096, 3186, 1, 1024] - - [627, 9468.32] + - [699, 9468.32] - - [1024, 3976, 1, 1024] - - [631, 9167.08] + - [703, 9167.08] - - [1024, 3511, 1, 4096] - - [633, 8335.06] + - [705, 8335.06] - - [4096, 3573, 1, 1024] - - [626, 9855.33] + - [698, 9855.33] - - [4096, 3561, 1, 1024] - - [626, 9831.03] + - [698, 9831.03] - - [4096, 3418, 1, 1024] - - [627, 9450.68] + - [699, 9450.68] - - [33708, 3906, 1, 1024] - - [629, 9973.67] + - [701, 9973.67] - - [4096, 3259, 1, 1024] - - [626, 9685.26] + - [698, 9685.26] - - [4096, 3308, 1, 1024] - - [628, 9792.03] + - [700, 9792.03] - - [1024, 3419, 1, 4096] - - [648, 8514.53] + - [720, 8514.53] - - [1024, 3215, 1, 4096] - - [647, 8137.53] + - [719, 8137.53] - - [1024, 4030, 1, 4096] - - [629, 9290.76] + - [701, 9290.76] - - [4096, 3459, 1, 1024] - - [626, 9567.57] + - [698, 9567.57] - - [1024, 3572, 1, 4096] - - [646, 8501.43] + - [718, 8501.43] - - [1024, 3137, 1, 4096] - - [648, 7930.15] + - [720, 7930.15] - - [1024, 3312, 1, 4096] - - [649, 8378.6] + - [721, 8378.6] - - [1024, 3925, 1, 4096] - - [631, 9255.86] + - [703, 9255.86] - - [1024, 3453, 1, 4096] - - [648, 8630.76] + - [720, 8630.76] - - [4096, 3435, 1, 1024] - - [627, 9495.18] + - [699, 9495.18] - - [1024, 3176, 1, 4096] - - [648, 8087.23] + - [720, 8087.23] - - [1024, 3444, 1, 4096] - - [640, 8528.58] + - [712, 8528.58] - - [4096, 3975, 1, 1024] - - [629, 9645.34] + - [701, 9645.34] - - [4096, 3182, 1, 1024] - - [628, 9448.4] + - [700, 9448.4] - - [1024, 3475, 1, 4096] - - [647, 8404.87] + - [719, 8404.87] - - [9, 9, 6544, 64] - - [662, 425.854] + - [734, 425.854] - - [33708, 3955, 1, 1024] - - [629, 10088.4] + - [701, 10088.4] - - [4096, 3446, 1, 1024] - - [628, 9520.06] + - [700, 9520.06] - - [1024, 3138, 1, 4096] - - [647, 8053.44] + - [719, 8053.44] - - [1024, 3549, 1, 4096] - - [633, 8426.42] + - [705, 8426.42] - - [4096, 3287, 1, 1024] - - [629, 9751.34] + - [701, 9751.34] - - [1024, 3342, 1, 4096] - - [646, 8320.01] + - [718, 8320.01] - - [102, 102, 624, 64] - - [661, 4747.52] + - [733, 4747.52] - - [4096, 3519, 1, 1024] - - [628, 9716.1] + - [700, 9716.1] - - [4096, 3552, 1, 1024] - - [626, 9806.69] + - [698, 9806.69] - - [4096, 3859, 1, 1024] - - [626, 9369.94] + - [698, 9369.94] - - [33708, 3969, 1, 1024] - - [626, 9830.39] + - [698, 9830.39] - - [1024, 3369, 1, 4096] - - [647, 8379.26] + - [719, 8379.26] - - [4096, 3482, 1, 1024] - - [626, 9631.7] + - [698, 9631.7] - - [1024, 3306, 1, 4096] - - [649, 8320.06] + - [721, 8320.06] - - [1024, 3474, 1, 4096] - - [648, 8498.9] + - [720, 8498.9] - - [99, 99, 624, 64] - - [660, 4492.9] + - [732, 4492.9] - - [4096, 3377, 1, 1024] - - [626, 9369.92] + - [698, 9369.92] - - [4096, 3426, 1, 1024] - - [626, 9467.3] + - [698, 9467.3] - - [4096, 2935, 1, 1024] - - [627, 9423.74] + - [699, 9423.74] - - [4096, 3267, 1, 1024] - - [626, 9698.04] + - [698, 9698.04] - - [1024, 3299, 1, 4096] - - [647, 8264.76] + - [719, 8264.76] - - [1024, 3456, 1, 4096] - - [646, 8678.39] + - [718, 8678.39] - - [1024, 3280, 1, 4096] - - [647, 8220.69] + - [719, 8220.69] - - [1024, 3555, 1, 4096] - - [646, 8656.27] + - [718, 8656.27] - - [4096, 3499, 1, 1024] - - [628, 9663.93] + - [700, 9663.93] - - [4096, 3356, 1, 1024] - - [628, 9296.9] + - [700, 9296.9] - - [100, 102, 624, 64] - - [661, 4671.51] + - [733, 4671.51] - - [1024, 3412, 1, 4096] - - [649, 8538.05] + - [721, 8538.05] - - [1024, 2984, 1, 4096] - - [632, 9193.17] + - [704, 9193.17] - - [4096, 3141, 1, 1024] - - [628, 9349.43] + - [700, 9349.43] - - [4096, 3510, 1, 1024] - - [626, 9701.98] + - [698, 9701.98] - - [1024, 3995, 1, 1024] - - [630, 9243.4] + - [702, 9243.4] - - [1024, 3517, 1, 4096] - - [648, 8569.31] + - [720, 8569.31] - - [1024, 3455, 1, 4096] - - [648, 8560.67] + - [720, 8560.67] - - [1024, 3939, 1, 1024] - - [631, 9030.94] + - [703, 9030.94] - - [38, 38, 1680, 64] - - [660, 2459.84] + - [732, 2459.84] - - [1024, 3447, 1, 4096] - - [646, 8610.02] + - [718, 8610.02] - - [1024, 3969, 1, 4096] - - [633, 9097.33] + - [705, 9097.33] - - [4096, 3527, 1, 1024] - - [628, 9743.83] + - [700, 9743.83] - - [4096, 3336, 1, 1024] - - [628, 9248.33] + - [700, 9248.33] - - [1024, 3191, 1, 4096] - - [646, 8104.96] + - [718, 8104.96] - - [1024, 3302, 1, 4096] - - [647, 8245.09] + - [719, 8245.09] - - [1024, 3337, 1, 4096] - - [649, 8254.25] + - [721, 8254.25] - - [4096, 3290, 1, 1024] - - [628, 9759.13] + - [700, 9759.13] - - [1024, 3512, 1, 4096] - - [637, 8641.06] + - [709, 8641.06] - - [1024, 3433, 1, 4096] - - [647, 8444.7] + - [719, 8444.7] - - [4096, 3876, 1, 1024] - - [627, 9420.38] + - [699, 9420.38] - - [4096, 3490, 1, 1024] - - [628, 9641.11] + - [700, 9641.11] - - [4096, 3064, 1, 1024] - - [628, 9820.49] + - [700, 9820.49] - - [1024, 3508, 1, 4096] - - [643, 8442.24] + - [715, 8442.24] - - [1024, 3956, 1, 4096] - - [628, 9128.19] + - [700, 9128.19] - - [4096, 3417, 1, 1024] - - [628, 9448.41] + - [700, 9448.41] - - [1024, 3248, 1, 4096] - - [647, 8006.16] + - [719, 8006.16] - - [1024, 2499, 1, 4096] - - [647, 8155.19] + - [719, 8155.19] - - [1024, 3186, 1, 4096] - - [647, 8093.04] + - [719, 8093.04] - - [1024, 3180, 1, 4096] - - [649, 8097.02] + - [721, 8097.02] - - [4096, 3364, 1, 1024] - - [628, 9318.08] + - [700, 9318.08] - - [4096, 3976, 1, 1024] - - [628, 9654.47] + - [700, 9654.47] - - [4096, 3205, 1, 1024] - - [629, 9538.84] + - [701, 9538.84] - - [4096, 3318, 1, 1024] - - [626, 9838.29] + - [698, 9838.29] - - [1024, 3377, 1, 4096] - - [649, 8445.64] + - [721, 8445.64] - - [1024, 3485, 1, 4096] - - [646, 8368.83] + - [718, 8368.83] - - [4096, 3181, 1, 1024] - - [629, 9458.29] + - [701, 9458.29] - - [4096, 3550, 1, 1024] - - [626, 9783.14] + - [698, 9783.14] - - [1024, 3534, 1, 4096] - - [635, 8684.99] + - [707, 8684.99] - - [1024, 3860, 1, 1024] - - [630, 8923.18] + - [702, 8923.18] - - [160, 160, 400, 64] - - [673, 5797.69] + - [745, 5797.69] - - [4096, 3445, 1, 1024] - - [628, 9511.28] + - [700, 9511.28] - - [1024, 3391, 1, 4096] - - [649, 8541.77] + - [721, 8541.77] - - [1024, 3221, 1, 4096] - - [647, 8055.5] + - [719, 8055.5] - - [4096, 3079, 1, 1024] - - [626, 9181.04] + - [698, 9181.04] - - [4096, 3144, 1, 1024] - - [628, 9351.45] + - [700, 9351.45] - - [1024, 3270, 1, 4096] - - [648, 8367.63] + - [720, 8367.63] - - [1024, 3561, 1, 4096] - - [648, 8426.29] + - [720, 8426.29] - - [1024, 3480, 1, 4096] - - [635, 8465.0] + - [707, 8465.0] - - [4096, 3408, 1, 1024] - - [628, 9420.04] + - [700, 9420.04] - - [1024, 3418, 1, 4096] - - [649, 8481.02] + - [721, 8481.02] - - [4096, 3298, 1, 1024] - - [629, 9788.4] + - [701, 9788.4] - - [1024, 3640, 1, 1024] - - [632, 8435.44] + - [704, 8435.44] - - [1024, 3449, 1, 4096] - - [647, 8590.87] + - [719, 8590.87] - - [1024, 4020, 1, 4096] - - [625, 9168.13] + - [697, 9168.13] - - [4096, 3481, 1, 1024] - - [626, 9627.91] + - [698, 9627.91] - - [4096, 3530, 1, 1024] - - [628, 9734.68] + - [700, 9734.68] - - [1024, 3216, 1, 4096] - - [649, 8014.32] + - [721, 8014.32] - - [1024, 3840, 1, 1024] - - [632, 8908.37] + - [704, 8908.37] - - [1024, 3491, 1, 4096] - - [635, 8410.59] + - [707, 8410.59] - - [1024, 3154, 1, 4096] - - [648, 8095.69] + - [720, 8095.69] - - [4096, 3425, 1, 1024] - - [628, 9474.53] + - [700, 9474.53] - - [1024, 3348, 1, 4096] - - [646, 8202.9] + - [718, 8202.9] - - [1024, 3415, 1, 4096] - - [647, 8597.68] + - [719, 8597.68] - - [1024, 4026, 1, 1024] - - [630, 9279.09] + - [702, 9279.09] - - [1024, 3367, 1, 4096] - - [649, 8335.54] + - [721, 8335.54] - - [1024, 3259, 1, 4096] - - [649, 8285.3] + - [721, 8285.3] - - [1024, 3894, 1, 4096] - - [632, 9040.44] + - [704, 9040.44] - - [4096, 3355, 1, 1024] - - [627, 9291.67] + - [699, 9291.67] - - [4096, 3404, 1, 1024] - - [628, 9410.47] + - [700, 9410.47] - - [1024, 3308, 1, 4096] - - [649, 8336.3] + - [721, 8336.3] - - [4096, 3245, 1, 1024] - - [627, 9641.47] + - [699, 9641.47] - - [1024, 3502, 1, 4096] - - [648, 8375.9] + - [720, 8375.9] - - [33708, 4032, 1, 1024] - - [627, 9988.2] + - [699, 9988.2] - - [8, 8, 7280, 64] - - [664, 339.878] + - [736, 339.878] - - [1024, 3424, 1, 4096] - - [635, 8489.48] + - [707, 8489.48] - - [4096, 3509, 1, 1024] - - [627, 9702.29] + - [699, 9702.29] - - [4096, 3558, 1, 1024] - - [628, 9815.51] + - [700, 9815.51] - - [1024, 3900, 1, 1024] - - [631, 9014.05] + - [703, 9014.05] - - [1024, 2505, 1, 4096] - - [645, 8263.75] + - [717, 8263.75] - - [4096, 3472, 1, 1024] - - [626, 9609.61] + - [698, 9609.61] - - [1024, 3386, 1, 4096] - - [646, 8417.55] + - [718, 8417.55] - - [4096, 3383, 1, 1024] - - [628, 9364.77] + - [700, 9364.77] - - [4096, 3448, 1, 1024] - - [629, 9521.07] + - [701, 9521.07] - - [4096, 4030, 1, 1024] - - [629, 9771.56] + - [701, 9771.56] - - [4096, 3289, 1, 1024] - - [626, 9757.27] + - [698, 9757.27] - - [1024, 3459, 1, 4096] - - [648, 8422.12] + - [720, 8422.12] - - [1024, 2918, 1, 4096] - - [633, 9022.71] + - [705, 9022.71] - - [4096, 3489, 1, 1024] - - [626, 9641.9] + - [698, 9641.9] - - [4096, 3346, 1, 1024] - - [628, 9271.65] + - [700, 9271.65] - - [4096, 3572, 1, 1024] - - [628, 9829.82] + - [700, 9829.82] - - [1024, 3955, 1, 4096] - - [629, 9221.66] + - [701, 9221.66] - - [4096, 3236, 1, 1024] - - [626, 9620.72] + - [698, 9620.72] - - [4096, 3163, 1, 1024] - - [626, 9397.3] + - [698, 9397.3] - - [4096, 3468, 1, 1024] - - [626, 9601.58] + - [698, 9601.58] - - [1024, 3165, 1, 4096] - - [648, 7941.58] + - [720, 7941.58] - - [1024, 3276, 1, 4096] - - [648, 8244.96] + - [720, 8244.96] - - [1024, 3359, 1, 4096] - - [646, 8273.93] + - [718, 8273.93] - - [4096, 3363, 1, 1024] - - [628, 9315.8] + - [700, 9315.8] - - [1024, 3385, 1, 4096] - - [640, 8286.2] + - [712, 8286.2] - - [1024, 3207, 1, 4096] - - [649, 8144.02] + - [721, 8144.02] - - [1024, 3458, 1, 4096] - - [648, 8472.41] + - [720, 8472.41] - - [21, 21, 2976, 64] - - [664, 2083.3] + - [736, 2083.3] - - [4096, 3110, 1, 1024] - - [626, 9260.3] + - [698, 9260.3] - - [4096, 3925, 1, 1024] - - [629, 9526.66] + - [701, 9526.66] - - [1024, 3975, 1, 4096] - - [624, 9133.84] + - [696, 9133.84] - - [4096, 3549, 1, 1024] - - [628, 9793.77] + - [700, 9793.77] - - [4096, 3342, 1, 1024] - - [627, 9264.48] + - [699, 9264.48] - - [1024, 3859, 1, 1024] - - [630, 8933.47] + - [702, 8933.47] - - [1024, 3497, 1, 4096] - - [647, 8526.13] + - [719, 8526.13] - - [4096, 3280, 1, 1024] - - [628, 9733.32] + - [700, 9733.32] - - [1024, 3435, 1, 4096] - - [647, 8489.85] + - [719, 8489.85] - - [1024, 3354, 1, 4096] - - [647, 8248.83] + - [719, 8248.83] - - [4096, 3191, 1, 1024] - - [627, 9475.12] + - [699, 9475.12] - - [4096, 3512, 1, 1024] - - [626, 9701.37] + - [698, 9701.37] - - [1024, 3055, 1, 4096] - - [633, 9264.91] + - [705, 9264.91] - - [4096, 2499, 1, 1024] - - [628, 9574.06] + - [700, 9574.06] - - [1024, 3233, 1, 4096] - - [646, 8101.74] + - [718, 8101.74] - - [4096, 3423, 1, 1024] - - [629, 9463.5] + - [701, 9463.5] - - [1024, 3319, 1, 4096] - - [649, 8413.76] + - [721, 8413.76] - - [4096, 3297, 1, 1024] - - [626, 9782.66] + - [698, 9782.66] - - [4096, 3154, 1, 1024] - - [628, 9381.2] + - [700, 9381.2] - - [1024, 3540, 1, 4096] - - [649, 8507.53] + - [721, 8507.53] - - [1024, 3289, 1, 4096] - - [649, 8233.8] + - [721, 8233.8] - - [4096, 3529, 1, 1024] - - [628, 9741.15] + - [700, 9741.15] - - [4096, 3386, 1, 1024] - - [628, 9372.57] + - [700, 9372.57] - - [4096, 3276, 1, 1024] - - [626, 9713.76] + - [698, 9713.76] - - [1024, 3244, 1, 4096] - - [649, 8146.83] + - [721, 8146.83] - - [1024, 3182, 1, 4096] - - [646, 8115.12] + - [718, 8115.12] - - [4096, 3540, 1, 1024] - - [626, 9768.42] + - [698, 9768.42] - - [1024, 3360, 1, 4096] - - [648, 8353.31] + - [720, 8353.31] - - [1024, 3942, 1, 4096] - - [627, 9143.78] + - [699, 9143.78] - - [4096, 3403, 1, 1024] - - [629, 9412.18] + - [701, 9412.18] - - [4096, 3101, 1, 1024] - - [629, 9239.28] + - [701, 9239.28] - - [4096, 2918, 1, 1024] - - [628, 9373.75] + - [700, 9373.75] - - [1024, 3465, 1, 4096] - - [649, 8288.16] + - [721, 8288.16] - - [33708, 3780, 1, 1024] - - [628, 9971.91] + - [700, 9971.91] - - [4096, 3557, 1, 1024] - - [626, 9814.82] + - [698, 9814.82] - - [4096, 3414, 1, 1024] - - [626, 9436.63] + - [698, 9436.63] - - [1024, 3948, 1, 1024] - - [630, 9073.8] + - [702, 9073.8] - - [4096, 3320, 1, 1024] - - [628, 9834.77] + - [700, 9834.77] - - [4096, 2765, 1, 1024] - - [628, 9667.06] + - [700, 9667.06] - - [1024, 3978, 1, 4096] - - [623, 9109.6] + - [695, 9109.6] - - [4096, 3487, 1, 1024] - - [626, 9644.0] + - [698, 9644.0] - - [4096, 3520, 1, 1024] - - [628, 9728.08] + - [700, 9728.08] - - [1024, 3139, 1, 4096] - - [648, 7940.19] + - [720, 7940.19] - - [1024, 3314, 1, 4096] - - [646, 8294.01] + - [718, 8294.01] - - [4096, 3431, 1, 1024] - - [628, 9482.12] + - [700, 9482.12] - - [123, 122, 528, 64] - - [661, 6325.98] + - [733, 6325.98] - - [1024, 3446, 1, 4096] - - [642, 8468.34] + - [714, 8468.34] - - [1024, 4059, 1, 4096] - - [629, 9370.8] + - [701, 9370.8] - - [99, 102, 624, 64] - - [661, 4624.8] + - [733, 4624.8] - - [4096, 3345, 1, 1024] - - [626, 9271.32] + - [698, 9271.32] - - [4096, 3394, 1, 1024] - - [626, 9398.19] + - [698, 9398.19] - - [1024, 3927, 1, 1024] - - [631, 9041.38] + - [703, 9041.38] - - [4096, 3235, 1, 1024] - - [626, 9619.93] + - [698, 9619.93] - - [1024, 3328, 1, 4096] - - [647, 8406.09] + - [719, 8406.09] - - [33708, 3956, 1, 1024] - - [627, 10100.4] + - [699, 10100.4] - - [4096, 3467, 1, 1024] - - [628, 9586.66] + - [700, 9586.66] - - [1024, 3287, 1, 4096] - - [648, 8273.83] + - [720, 8273.83] - - [4096, 3214, 1, 1024] - - [629, 9557.49] + - [701, 9557.49] - - [4096, 3910, 1, 1024] - - [626, 9490.25] + - [698, 9490.25] - - [1024, 3780, 1, 1024] - - [633, 8706.0] + - [705, 8706.0] - - [1024, 3371, 1, 4096] - - [649, 8248.46] + - [721, 8248.46] - - [4096, 3478, 1, 1024] - - [629, 9619.62] + - [701, 9619.62] - - [1024, 3546, 1, 4096] - - [647, 8456.83] + - [719, 8456.83] - - [1024, 4012, 1, 1024] - - [630, 9253.34] + - [702, 9253.34] - - [4096, 3341, 1, 1024] - - [628, 9260.24] + - [700, 9260.24] - - [4096, 3454, 1, 1024] - - [626, 9533.62] + - [698, 9533.62] - - [4096, 3295, 1, 1024] - - [629, 9772.86] + - [701, 9772.86] - - [4096, 3072, 1, 1024] - - [626, 9887.23] + - [698, 9887.23] - - [1024, 3282, 1, 4096] - - [634, 8112.85] + - [706, 8112.85] - - [33708, 3720, 1, 1024] - - [629, 9818.85] + - [701, 9818.85] - - [1024, 3681, 1, 4096] - - [631, 8639.28] + - [703, 8639.28] - - [1024, 4050, 1, 4096] - - [629, 9291.93] + - [701, 9291.93] - - [4096, 3495, 1, 1024] - - [628, 9660.52] + - [700, 9660.52] - - [4096, 3560, 1, 1024] - - [627, 9813.8] + - [699, 9813.8] - - [4096, 3751, 1, 1024] - - [626, 9684.95] + - [698, 9684.95] - - [1024, 3414, 1, 4096] - - [647, 8555.72] + - [719, 8555.72] - - [33708, 3860, 1, 1024] - - [626, 9856.68] + - [698, 9856.68] - - [1024, 3325, 1, 4096] - - [636, 8261.21] + - [708, 8261.21] - - [4096, 3458, 1, 1024] - - [626, 9570.86] + - [698, 9570.86] - - [4096, 2967, 1, 1024] - - [626, 9544.61] + - [698, 9544.61] - - [1024, 3519, 1, 4096] - - [649, 8413.1] + - [721, 8413.1] - - [4096, 3385, 1, 1024] - - [628, 9367.34] + - [700, 9367.34] - - [4096, 3434, 1, 1024] - - [626, 9488.41] + - [698, 9488.41] - - [1024, 3552, 1, 4096] - - [647, 8456.13] + - [719, 8456.13] - - [4096, 3822, 1, 1024] - - [627, 9849.84] + - [699, 9849.84] - - [1024, 3544, 1, 4096] - - [646, 8494.56] + - [718, 8494.56] - - [4096, 3539, 1, 1024] - - [628, 9763.09] + - [700, 9763.09] - - [4096, 3332, 1, 1024] - - [626, 9232.36] + - [698, 9232.36] - - [1024, 3145, 1, 4096] - - [646, 8098.36] + - [718, 8098.36] - - [1024, 3535, 1, 4096] - - [634, 8592.8] + - [706, 8592.8] - - [1024, 3320, 1, 4096] - - [647, 8419.55] + - [719, 8419.55] - - [33708, 4012, 1, 1024] - - [629, 9940.2] + - [701, 9940.2] - - [4096, 3286, 1, 1024] - - [628, 9747.82] + - [700, 9747.82] - - [1024, 3514, 1, 4096] - - [647, 8653.69] + - [719, 8653.69] - - [93, 93, 688, 64] - - [668, 5005.79] + - [740, 5005.79] - - [1024, 2765, 1, 4096] - - [633, 8636.72] + - [705, 8636.72] - - [1024, 3452, 1, 4096] - - [646, 8445.87] + - [718, 8445.87] - - [4096, 3518, 1, 1024] - - [626, 9722.56] + - [698, 9722.56] - - [1024, 3529, 1, 4096] - - [646, 8444.32] + - [718, 8444.32] - - [4096, 3413, 1, 1024] - - [626, 9436.35] + - [698, 9436.35] - - [33708, 4050, 1, 1024] - - [628, 10026.7] + - [700, 10026.7] - - [1024, 3525, 1, 4096] - - [639, 8488.99] + - [711, 8488.99] - - [4096, 3303, 1, 1024] - - [626, 9791.05] + - [698, 9791.05] - - [1024, 3382, 1, 4096] - - [647, 8483.63] + - [719, 8483.63] - - [1024, 3390, 1, 4096] - - [646, 8552.81] + - [718, 8552.81] - - [1024, 3977, 1, 4096] - - [628, 9053.53] + - [700, 9053.53] - - [1024, 3184, 1, 4096] - - [646, 8008.81] + - [718, 8008.81] - - [4096, 3535, 1, 1024] - - [628, 9760.79] + - [700, 9760.79] - - [4096, 3376, 1, 1024] - - [629, 9341.93] + - [701, 9341.93] - - [4096, 3978, 1, 1024] - - [629, 9642.8] + - [701, 9642.8] - - [1024, 3136, 1, 4096] - - [648, 8085.12] + - [720, 8085.12] - - [1024, 3293, 1, 4096] - - [646, 8300.49] + - [718, 8300.49] - - [4096, 3266, 1, 1024] - - [627, 9691.78] + - [699, 9691.78] - - [1024, 3487, 1, 4096] - - [646, 8383.62] + - [718, 8383.62] - - [1024, 3409, 1, 4096] - - [648, 8493.25] + - [720, 8493.25] - - [4096, 3498, 1, 1024] - - [627, 9672.38] + - [699, 9672.38] - - [1024, 3520, 1, 4096] - - [649, 8488.26] + - [721, 8488.26] - - [1024, 3530, 1, 4096] - - [630, 8409.87] + - [702, 8409.87] - - [4096, 3393, 1, 1024] - - [628, 9395.43] + - [700, 9395.43] - - [4096, 3140, 1, 1024] - - [628, 9338.5] + - [700, 9338.5] - - [1024, 3536, 1, 4096] - - [649, 8642.11] + - [721, 8642.11] - - [1024, 3288, 1, 4096] - - [649, 8229.34] + - [721, 8229.34] - - [1024, 4005, 1, 4096] - - [631, 9271.04] + - [703, 9271.04] - - [1024, 3579, 1, 4096] - - [635, 8844.5] + - [707, 8844.5] - - [4096, 3372, 1, 1024] - - [626, 9339.25] + - [698, 9339.25] - - [1024, 3440, 1, 4096] - - [646, 8466.69] + - [718, 8466.69] - - [4096, 3213, 1, 1024] - - [629, 9558.85] + - [701, 9558.85] - - [123, 123, 528, 64] - - [661, 6333.59] + - [733, 6333.59] - - [100, 100, 624, 64] - - [660, 4584.12] + - [732, 4584.12] - - [1024, 3968, 1, 4096] - - [627, 9237.6] + - [699, 9237.6] - - [4096, 3477, 1, 1024] - - [627, 9618.88] + - [699, 9618.88] - - [4096, 3526, 1, 1024] - - [626, 9735.94] + - [698, 9735.94] - - [1024, 3493, 1, 4096] - - [647, 8355.13] + - [719, 8355.13] - - [1024, 3944, 1, 4096] - - [622, 9065.39] + - [694, 9065.39] - - [4096, 3453, 1, 1024] - - [627, 9533.37] + - [699, 9533.37] - - [1024, 3350, 1, 4096] - - [649, 8448.64] + - [721, 8448.64] - - [4096, 3184, 1, 1024] - - [628, 9447.38] + - [700, 9447.38] - - [1024, 3423, 1, 4096] - - [647, 8465.38] + - [719, 8465.38] - - [4096, 3351, 1, 1024] - - [626, 9282.06] + - [698, 9282.06] - - [4096, 3416, 1, 1024] - - [626, 9446.64] + - [698, 9446.64] - - [1024, 3796, 1, 4096] - - [628, 8820.34] + - [700, 8820.34] - - [4096, 3257, 1, 1024] - - [626, 9671.64] + - [698, 9671.64] - - [4096, 3306, 1, 1024] - - [628, 9795.51] + - [700, 9795.51] - - [33708, 4020, 1, 1024] - - [628, 9961.85] + - [700, 9961.85] - - [19, 19, 3264, 64] - - [658, 1736.09] + - [730, 1736.09] - - [1024, 3426, 1, 4096] - - [646, 8518.61] + - [718, 8518.61] - - [4096, 3457, 1, 1024] - - [626, 9564.56] + - [698, 9564.56] - - [1024, 2935, 1, 4096] - - [631, 9067.79] + - [703, 9067.79] - - [1024, 3046, 1, 4096] - - [631, 9242.97] + - [703, 9242.97] - - [4096, 3433, 1, 1024] - - [628, 9495.65] + - [700, 9495.65] - - [1024, 3256, 1, 4096] - - [649, 8224.23] + - [721, 8224.23] - - [1024, 3531, 1, 4096] - - [646, 8524.19] + - [718, 8524.19] - - [4096, 3180, 1, 1024] - - [626, 9443.53] + - [698, 9443.53] - - [1024, 3388, 1, 4096] - - [648, 8352.82] + - [720, 8352.82] - - [4096, 3444, 1, 1024] - - [629, 9511.03] + - [701, 9511.03] - - [1024, 3501, 1, 4096] - - [636, 8461.12] + - [708, 8461.12] - - [1024, 3266, 1, 4096] - - [634, 8147.44] + - [706, 8147.44] - - [1024, 3267, 1, 4096] - - [649, 8391.49] + - [721, 8391.49] - - [1024, 3461, 1, 4096] - - [633, 8270.29] + - [705, 8270.29] - - [4096, 3870, 1, 1024] - - [628, 9399.69] + - [700, 9399.69] - - [4096, 3517, 1, 1024] - - [626, 9725.43] + - [698, 9725.43] - - [1024, 3566, 1, 4096] - - [649, 8669.76] + - [721, 8669.76] - - [4096, 3574, 1, 1024] - - [626, 9844.63] + - [698, 9844.63] - - [1024, 3876, 1, 1024] - - [631, 8961.74] + - [703, 8961.74] - - [25, 25, 2512, 64] - - [657, 2472.54] + - [729, 2472.54] - - [4096, 3720, 1, 1024] - - [626, 9612.49] + - [698, 9612.49] - - [4096, 3248, 1, 1024] - - [628, 9644.92] + - [700, 9644.92] - - [4096, 4059, 1, 1024] - - [626, 9826.42] + - [698, 9826.42] - - [1024, 3380, 1, 4096] - - [647, 8677.91] + - [719, 8677.91] - - [4096, 3480, 1, 1024] - - [628, 9626.16] + - [700, 9626.16] - - [1024, 3335, 1, 4096] - - [648, 8302.18] + - [720, 8302.18] - - [1024, 3345, 1, 4096] - - [648, 8323.13] + - [720, 8323.13] - - [4096, 3391, 1, 1024] - - [626, 9379.48] + - [698, 9379.48] - - [4096, 3424, 1, 1024] - - [628, 9466.77] + - [700, 9466.77] - - [1024, 3394, 1, 4096] - - [634, 8373.91] + - [706, 8373.91] - - [4096, 3265, 1, 1024] - - [628, 9700.89] + - [700, 9700.89] - - [1024, 3014, 1, 4096] - - [631, 9303.09] + - [703, 9303.09] - - [4096, 3497, 1, 1024] - - [626, 9668.6] + - [698, 9668.6] - - [4096, 3354, 1, 1024] - - [628, 9294.31] + - [700, 9294.31] - - [4096, 3055, 1, 1024] - - [627, 9780.88] + - [699, 9780.88] - - [1024, 3499, 1, 4096] - - [640, 8527.04] + - [712, 8527.04] - - [1024, 3162, 1, 4096] - - [648, 8059.02] + - [720, 8059.02] - - [4096, 3244, 1, 1024] - - [628, 9636.86] + - [700, 9636.86] - - [1024, 3437, 1, 4096] - - [647, 8583.41] + - [719, 8583.41] - - [1024, 3356, 1, 4096] - - [649, 8296.95] + - [721, 8296.95] - - [4096, 3139, 1, 1024] - - [628, 9338.7] + - [700, 9338.7] - - [4096, 3508, 1, 1024] - - [628, 9700.54] + - [700, 9700.54] - - [1024, 3235, 1, 4096] - - [646, 8314.59] + - [718, 8314.59] - - [1024, 3910, 1, 4096] - - [633, 9200.21] + - [705, 9200.21] - - [4096, 3371, 1, 1024] - - [626, 9336.97] + - [698, 9336.97] - - [1024, 3751, 1, 4096] - - [633, 8827.67] + - [705, 8827.67] - - [4096, 3325, 1, 1024] - - [626, 9845.68] + - [698, 9845.68] - - [1024, 3413, 1, 4096] - - [634, 8345.78] + - [706, 8345.78] - - [1024, 3542, 1, 4096] - - [646, 8521.71] + - [718, 8521.71] - - [18, 18, 3440, 64] - - [662, 1578.24] + - [734, 1578.24] - - [101, 102, 624, 64] - - [660, 4705.28] + - [732, 4705.28] - - [33708, 3900, 1, 1024] - - [626, 9951.05] + - [698, 9951.05] - - [4096, 3525, 1, 1024] - - [627, 9744.47] + - [699, 9744.47] - - [4096, 3382, 1, 1024] - - [627, 9359.03] + - [699, 9359.03] - - [102, 100, 624, 64] - - [661, 4671.51] + - [733, 4671.51] - - [15, 15, 4096, 64] - - [665, 1129.17] + - [737, 1129.17] - - [1024, 3339, 1, 4096] - - [635, 8326.37] + - [707, 8326.37] - - [4096, 3288, 1, 1024] - - [628, 9761.48] + - [700, 9761.48] - - [92, 92, 688, 64] - - [668, 4903.87] + - [740, 4903.87] - - [1024, 3141, 1, 4096] - - [646, 7975.64] + - [718, 7975.64] - - [1024, 3168, 1, 4096] - - [646, 8083.74] + - [718, 8083.74] - - [4096, 3488, 1, 1024] - - [628, 9646.77] + - [700, 9646.77] - - [4096, 3046, 1, 1024] - - [627, 9767.58] + - [699, 9767.58] - - [1024, 3362, 1, 4096] - - [649, 8458.15] + - [721, 8458.15] - - [33708, 3942, 1, 1024] - - [627, 10060.4] + - [699, 10060.4] - - [4096, 3399, 1, 1024] - - [628, 9406.57] + - [700, 9406.57] - - [1024, 3720, 1, 1024] - - [630, 8639.16] + - [702, 8639.16] - - [4096, 3563, 1, 1024] - - [626, 9836.55] + - [698, 9836.55] - - [1024, 3273, 1, 4096] - - [649, 8221.62] + - [721, 8221.62] - - [4096, 3162, 1, 1024] - - [628, 9400.19] + - [700, 9400.19] - - [1024, 3467, 1, 4096] - - [647, 8342.42] + - [719, 8342.42] - - [1024, 3130, 1, 4096] - - [648, 7933.88] + - [720, 7933.88] - - [1024, 3405, 1, 4096] - - [655, 8406.59] + - [727, 8406.59] - - [4096, 3362, 1, 1024] - - [626, 9312.04] + - [698, 9312.04] - - [1024, 3960, 1, 1024] - - [630, 9082.26] + - [702, 9082.26] - - [2048, 128, 1, 4096] - - [680, 5986.62] + - [752, 5986.62] - - [1024, 3712, 1, 36548] - - [678, 9456.25] + - [750, 9456.25] - - [1024, 128, 1, 1024] - - [681, 3631.53] + - [753, 3631.53] - - [3072, 128, 1, 4096] - - [677, 6145.6] + - [749, 6145.6] - - [1024, 3712, 1, 1024] - - [679, 8933.98] + - [751, 8933.98] + - - [256, 256, 192, 64] + - [756, 8264.74] + - - [768, 4096, 1, 768] + - [769, 9642.18] + - - [768, 64, 1, 768] + - [766, 1850.53] + - - [768, 1280, 1, 768] + - [769, 8738.23] + - - [30522, 320, 1, 768] + - [770, 9733.69] + - - [128, 128, 96, 64] + - [759, 5470.93] + - - [2, 16, 1, 768] + - [762, 2.57742] + - - [30522, 1280, 1, 768] + - [768, 10128.0] + - - [30522, 640, 1, 768] + - [769, 9987.71] + - - [2, 8, 1, 768] + - [761, 1.06] + - - [768, 4096, 1, 3072] + - [771, 9479.51] + - - [768, 32, 1, 768] + - [765, 880.434] + - - [2, 64, 1, 768] + - [762, 10.09024] + - - [256, 256, 96, 64] + - [756, 7614.57] + - - [64, 64, 768, 64] + - [758, 5354.53] + - - [30522, 160, 1, 768] + - [767, 7740.21] + - - [768, 320, 1, 768] + - [760, 5423.77] + - - [128, 128, 384, 64] + - [757, 7180.08] + - - [768, 16, 1, 768] + - [763, 706.476] + - - [3072, 4096, 1, 768] + - [772, 9961.84] + - - [2048, 512, 1, 100] + - [774, 5180.81] + - - [1024, 200, 1, 560] + - [775, 4061.29] + - - [256, 1280, 1, 1024] + - [782, 4337.54] + - - [256, 44505, 1, 1024] + - [818, 8597.79] + - - [10240, 8976, 1, 256] + - [821, 9471.53] + - - [256, 7168, 1, 1024] + - [812, 6718.66] + - - [8448, 8976, 1, 256] + - [804, 9601.41] + - - [18944, 8976, 1, 256] + - [813, 9666.36] + - - [256, 19200, 1, 1024] + - [789, 7489.04] + - - [5632, 8976, 1, 256] + - [801, 9358.49] + - - [256, 23552, 1, 1024] + - [816, 7980.99] + - - [256, 6656, 1, 1024] + - [816, 6287.32] + - - [256, 14336, 1, 1024] + - [811, 7049.36] + - - [256, 12544, 1, 1024] + - [789, 6728.57] + - - [2048, 684, 1, 768] + - [806, 8479.28] + - - [5376, 8976, 1, 256] + - [801, 9519.61] + - - [256, 5888, 1, 1024] + - [821, 6012.5] + - - [19968, 8976, 1, 256] + - [813, 9684.77] + - - [3840, 8976, 1, 256] + - [798, 9461.99] + - - [4608, 8976, 1, 256] + - [798, 9305.92] + - - [256, 684, 1, 1024] + - [824, 3513.16] + - - [256, 22016, 1, 1024] + - [789, 7643.89] + - - [256, 23296, 1, 1024] + - [818, 8048.22] + - - [4864, 8976, 1, 256] + - [796, 9545.72] + - - [256, 7424, 1, 1024] + - [814, 6770.75] + - - [18176, 8976, 1, 256] + - [821, 9729.57] + - - [256, 15104, 1, 1024] + - [810, 7289.18] + - - [8192, 8976, 1, 256] + - [813, 9395.59] + - - [256, 16128, 1, 1024] + - [813, 7461.38] + - - [13312, 8976, 1, 256] + - [821, 9551.07] + - - [256, 21504, 1, 1024] + - [818, 7636.03] + - - [6400, 8976, 1, 256] + - [805, 9561.06] + - - [256, 8960, 1, 1024] + - [780, 6292.46] + - - [1792, 8976, 1, 256] + - [795, 9372.28] + - - [13824, 8976, 1, 256] + - [813, 9585.37] + - - [11776, 8976, 1, 256] + - [813, 9560.44] + - - [256, 20992, 1, 1024] + - [811, 7490.75] + - - [20480, 8976, 1, 256] + - [821, 9610.8] + - - [5888, 8976, 1, 256] + - [792, 9565.3] + - - [256, 10496, 1, 1024] + - [783, 6632.06] + - - [21248, 8976, 1, 256] + - [813, 9755.87] + - - [5120, 8976, 1, 256] + - [821, 9244.69] + - - [7168, 8976, 1, 256] + - [813, 9388.52] + - - [2048, 1536, 1, 768] + - [802, 9446.14] + - - [256, 8192, 1, 1024] + - [807, 6948.99] + - - [4096, 8976, 1, 256] + - [812, 9116.04] + - - [3328, 8976, 1, 256] + - [805, 9434.65] + - - [1280, 8976, 1, 256] + - [803, 9129.9] + - - [2560, 8976, 1, 256] + - [800, 9199.58] + - - [3072, 8976, 1, 256] + - [815, 8963.7] + - - [256, 11776, 1, 1024] + - [793, 6869.9] + - - [18688, 8976, 1, 256] + - [821, 9726.31] + - - [15104, 8976, 1, 256] + - [821, 9715.81] + - - [23552, 8976, 1, 256] + - [813, 9648.52] + - - [6144, 8976, 1, 256] + - [821, 9339.9] + - - [12544, 8976, 1, 256] + - [821, 9654.55] + - - [256, 11264, 1, 1024] + - [794, 6815.08] + - - [2048, 114, 1, 512] + - [825, 4583.6] + - - [4352, 8976, 1, 256] + - [805, 9471.5] + - - [15360, 8976, 1, 256] + - [821, 9583.87] + - - [256, 31488, 1, 1024] + - [820, 8438.11] + - - [28672, 8976, 1, 256] + - [813, 9688.95] + - - [256, 18176, 1, 1024] + - [789, 7405.19] + - - [9728, 8976, 1, 256] + - [821, 9524.25] + - - [256, 2816, 1, 1024] + - [785, 5405.76] + - - [256, 18944, 1, 1024] + - [789, 7503.51] + - - [256, 3584, 1, 1024] + - [788, 6107.25] + - - [7936, 8976, 1, 256] + - [801, 9608.41] + - - [19712, 8976, 1, 256] + - [821, 9736.35] + - - [256, 14848, 1, 1024] + - [794, 7163.52] + - - [256, 8448, 1, 1024] + - [794, 6372.66] + - - [256, 6400, 1, 1024] + - [808, 6395.81] + - - [256, 6144, 1, 1024] + - [819, 6490.32] + - - [9472, 8976, 1, 256] + - [798, 9610.02] + - - [256, 9984, 1, 1024] + - [781, 6484.85] + - - [684, 8976, 1, 256] + - [790, 8128.63] + - - [20992, 8976, 1, 256] + - [813, 9689.75] + - - [2048, 684, 1, 512] + - [797, 7241.88] + - - [2048, 114, 1, 768] + - [823, 4872.56] + - - [8960, 8976, 1, 256] + - [796, 9603.45] + - - [2048, 1536, 1, 512] + - [799, 8830.21] + - - [256, 3328, 1, 1024] + - [787, 5612.65] + - - [33536, 8976, 1, 256] + - [813, 9797.81] + - - [2048, 8976, 1, 256] + - [813, 8975.56] + - - [10496, 8976, 1, 256] + - [804, 9654.53] + - - [256, 5376, 1, 1024] + - [822, 5626.44] + - - [256, 21248, 1, 1024] + - [791, 7525.55] + - - [256, 13312, 1, 1024] + - [789, 6767.21] + - - [16128, 8976, 1, 256] + - [813, 9715.67] + - - [2304, 8976, 1, 256] + - [786, 9433.93] + - - [256, 4864, 1, 1024] + - [776, 5743.65] + - - [17152, 8976, 1, 256] + - [821, 9709.04] + - - [15872, 8976, 1, 256] + - [821, 9657.67] + - - [9984, 8976, 1, 256] + - [798, 9639.84] + - - [256, 14592, 1, 1024] + - [810, 7224.02] + - - [256, 33536, 1, 1024] + - [817, 8147.41] + - - [11264, 8976, 1, 256] + - [813, 9510.06] + - - [31488, 8976, 1, 256] + - [821, 9799.41] + - - [256, 20480, 1, 1024] + - [794, 7498.3] + - - [44505, 8976, 1, 256] + - [805, 9804.88] + - - [13568, 8976, 1, 256] + - [813, 9680.34] + - - [256, 11520, 1, 1024] + - [793, 6805.36] + - - [256, 7936, 1, 1024] + - [809, 6971.87] + - - [2048, 256, 1, 768] + - [779, 7129.23] + - - [256, 4608, 1, 1024] + - [777, 5463.01] + - - [256, 2304, 1, 1024] + - [784, 4842.79] + - - [256, 2560, 1, 1024] + - [785, 5309.35] + - - [2816, 8976, 1, 256] + - [796, 9409.66] - null